//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
"llvm/Transforms/Utils/InjectTLIMappings.h" 88 #include "llvm/Transforms/Utils/Local.h" 89 #include "llvm/Transforms/Utils/LoopUtils.h" 90 #include <algorithm> 91 #include <cassert> 92 #include <cstdint> 93 #include <iterator> 94 #include <memory> 95 #include <optional> 96 #include <set> 97 #include <string> 98 #include <tuple> 99 #include <utility> 100 #include <vector> 101 102 using namespace llvm; 103 using namespace llvm::PatternMatch; 104 using namespace slpvectorizer; 105 106 #define SV_NAME "slp-vectorizer" 107 #define DEBUG_TYPE "SLP" 108 109 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 110 111 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, 112 cl::desc("Run the SLP vectorization passes")); 113 114 static cl::opt<int> 115 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 116 cl::desc("Only vectorize if you gain more than this " 117 "number ")); 118 119 static cl::opt<bool> 120 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, 121 cl::desc("Attempt to vectorize horizontal reductions")); 122 123 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 124 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 125 cl::desc( 126 "Attempt to vectorize horizontal reductions feeding into a store")); 127 128 // NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run 129 // even if we match a reduction but do not vectorize in the end. 130 static cl::opt<bool> AllowHorRdxIdenityOptimization( 131 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, 132 cl::desc("Allow optimization of original scalar identity operations on " 133 "matched horizontal reductions.")); 134 135 static cl::opt<int> 136 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, 137 cl::desc("Attempt to vectorize for this register size in bits")); 138 139 static cl::opt<unsigned> 140 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, 141 cl::desc("Maximum SLP vectorization factor (0=unlimited)")); 142 143 static cl::opt<int> 144 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, 145 cl::desc("Maximum depth of the lookup for consecutive stores.")); 146 147 /// Limits the size of scheduling regions in a block. 148 /// It avoid long compile times for _very_ large blocks where vector 149 /// instructions are spread over a wide range. 150 /// This limit is way higher than needed by real-world functions. 151 static cl::opt<int> 152 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, 153 cl::desc("Limit the size of the SLP scheduling region per block")); 154 155 static cl::opt<int> MinVectorRegSizeOption( 156 "slp-min-reg-size", cl::init(128), cl::Hidden, 157 cl::desc("Attempt to vectorize for this register size in bits")); 158 159 static cl::opt<unsigned> RecursionMaxDepth( 160 "slp-recursion-max-depth", cl::init(12), cl::Hidden, 161 cl::desc("Limit the recursion depth when building a vectorizable tree")); 162 163 static cl::opt<unsigned> MinTreeSize( 164 "slp-min-tree-size", cl::init(3), cl::Hidden, 165 cl::desc("Only vectorize small trees if they are fully vectorizable")); 166 167 // The maximum depth that the look-ahead score heuristic will explore. 168 // The higher this value, the higher the compilation time overhead. 
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is used less frequently,
// hence the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the LLVM benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type,
/// or extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  // TODO: This should check for generic Instruction::isCommutative(), but
  // we need to confirm that the caller code correctly handles Intrinsics
  // for example (does not have 2 operands).
  return false;
}

/// \returns the insertion index of the InsertElement or InsertValue
/// instruction, using \p Offset as the base offset for the index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used in the
               ///< future. Non-undef elements are considered unused since
               ///< they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask, either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx)
          continue;
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
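      // Illustrative example (not from the original source): for
      //   %v = insertelement <2 x i32> poison, i32 %x, i32 0
      // the loop above clears lane 0 of Res when that lane is marked as used
      // in \p UseMask, so only lane 1 can still be reported as undef.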
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul.
/// Mask will return the shuffle mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from an undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector;
  // otherwise we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gather such instructions into a group, which can
/// then likely be matched as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
static std::optional<TTI::ShuffleKind>
tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                           SmallVectorImpl<int> &Mask) {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
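  // Illustrative example (hypothetical values): for
  //   VL = {extractelement %a, 0; %s; extractelement %a, 2;
  //         extractelement %b, 1}
  // the scan below records indices {0, 2} for %a and {3} for %b, while the
  // plain scalar %s is left alone.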
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  MapVector<unsigned, SmallVector<Value *>> VFToVector;
  for (const auto &Data : VectorOpToIdx)
    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
        .push_back(Data.first);
  for (auto &Data : VFToVector) {
    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
      return VectorOpToIdx.find(V1)->second.size() >
             VectorOpToIdx.find(V2)->second.size();
    });
  }
  // Find the best pair of the vectors with the same number of elements or a
  // single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  Value *SingleVec = nullptr;
  unsigned PairMax = 0;
  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
  for (auto &Data : VFToVector) {
    Value *V1 = Data.second.front();
    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
      SingleVec = V1;
    }
    Value *V2 = nullptr;
    if (Data.second.size() > 1)
      V2 = *std::next(Data.second.begin());
    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
                            UndefSz) {
      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
      PairVec = std::make_pair(V1, V2);
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : VectorOpToIdx[SingleVec])
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else {
    for (Value *V : {PairVec.first, PairVec.second})
      for (int Idx : VectorOpToIdx[V])
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single vector, or of the two vectors the scalars are
  // extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    VL.swap(SavedVL);
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for the shuffle.
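  // (Extracts whose mask entry stayed PoisonMaskElem were not covered by the
  // chosen vector(s), so they are returned to VL to be gathered as ordinary
  // scalars.)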
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
      std::swap(VL[I], GatheredExtracts[I]);
  }
  return Res;
}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has a similar "look" and the
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
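/// For example, (icmp sgt %b, %a) matches a base of (icmp slt %a, %b): the
/// swapped predicate is slt and the operands line up once swapped.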
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, i.e. the opcode under which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
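  // Illustrative example: VL = {add, sub, add, sub} is accepted with
  // MainOp = add and AltOp = sub (the alternate-shuffle case), which can later
  // be materialized as a vector add, a vector sub and a shuffle selecting
  // lanes from each.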
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if (E == 2 &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;

  return true;
}

/// \returns True if the in-tree use also needs an extract. This refers to a
/// possible scalar operand in the vectorized instruction.
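/// For example, the pointer operand of a vectorized load or store, or a
/// scalar argument of a vectorized intrinsic call, stays scalar and must be
/// extracted.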
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
        return (CI->getArgOperand(i) == Scalar);
    }
    [[fallthrough]];
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

/// Order may have elements assigned the special value (size), which is out of
/// bounds. Such indices only appear at positions that correspond to undef
/// values (see canReuseExtract for details) and are used to prevent undef
/// values from affecting the operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices to the undef value positions.
/// As an example, below Order has two undef positions, and they get assigned
/// the values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            UndefValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not
/// instructions, or phi nodes, or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all its operands and all its users do not need to be
/// scheduled in the current basic block.
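/// For example, a side-effect-free instruction whose operands are constants
/// or phis and whose users all live in other basic blocks does not need to be
/// scheduled.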
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
/// This is the case if either all instructions have operands that do not
/// require scheduling, or all their users do not require scheduling (they are
/// phis or belong to other basic blocks).
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  class ShuffleCostEstimator;
  class ShuffleInstructionBuilder;

public:
  using ValueList = SmallVector<Value *, 8>;
  using InstrList = SmallVector<Instruction *, 16>;
  using ValueSet = SmallPtrSet<Value *, 16>;
  using StoreList = SmallVector<StoreInst *, 8>;
  using ExtraValueToDebugLocsMap =
      MapVector<Value *, SmallVector<Instruction *, 2>>;
  using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  /// \param ReplacedExternals contains the list of replaced external values
  /// {scalar, replace} after emitting extractelement for external uses.
  Value *
  vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
                Instruction *ReductionRoot = nullptr);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  InstructionCost getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
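  /// For example, a returned cost of -4 means the vectorized subtree is
  /// expected to be four cost units cheaper than the original scalar code.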
  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 const SmallDenseSet<Value *> &UserIgnoreLst);

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef<Value *> Roots);

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users.
  /// \p ExternallyUsedValues contains an additional list of external uses to
  /// handle vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// Checks if the specified gather tree entry \p TE can be represented as a
  /// shuffled vector entry + (possibly) a permutation with other gathers. It
  /// implements the checks only for possibly ordered scalars (Loads,
  /// ExtractElement, ExtractValue), which can be part of the graph.
  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

  /// Sort loads into increasing pointer offsets to allow greater clustering.
  std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);

  /// Gets reordering data for the given tree entry. If the entry is vectorized
  /// - just return ReorderIndices, otherwise check if the scalars can be
  /// reordered and return the most optimal order.
  /// \return std::nullopt if ordering is not important, an empty order if the
  /// identity order is important, or the actual order otherwise.
  /// \param TopToBottom If true, include the order of vectorized stores and
  /// insertelement nodes, otherwise skip them.
  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
                                              bool TopToBottom);

  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of a subgraph with a smaller VF and they are reordered
  /// independently. We
  /// can do this because we still need to extend smaller nodes to the wider VF
  /// and we can merge reordering shuffles with the widening shuffles.
  void reorderTopToBottom();

  /// Reorders the current graph to the most profitable order starting from
  /// the leaves to the root. It allows rotating small subgraphs, reducing the
  /// number of reshuffles if the leaf nodes use the same order. In this case
  /// we can merge the orders and just shuffle the user node instead of
  /// shuffling its operands. Plus, even if the leaf nodes have different
  /// orders, it allows sinking the reordering in the graph closer to the root
  /// node and merging it later during analysis.
  void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by
  // cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Check if a homogeneous aggregate is isomorphic to some VectorType.
  /// Accepts homogeneous multidimensional aggregates of scalars/vectors like
  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  /// may not be necessary.
  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern.
  /// For example, partially transforming a scalar bswap() pattern into vector
  /// code is effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  /// may not be necessary.
  bool isLoadCombineCandidate() const;

  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though they
    // should be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if
    // all scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in a different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
    /// for a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from the same vector and consecutive indices.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from the same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with the instructions in
    /// \p MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            static constexpr unsigned Limit = 8;
            if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return LookAheadHeuristics::ScoreFail;

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  FixedVectorType::get(LI1->getType(), NumLanes),
                  LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return LookAheadHeuristics::ScoreFail;
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-of-2 vectorization and may
        // produce better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector get a better
      // score, as the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        // Compiler can easily combine poison and extractelement <non-poison>
        // or undef and extractelement <poison>. But combining undef +
        // extractelement <non-poison-but-may-produce-poison> requires some
        // extra operations.
        if (isa<UndefValue>(V2))
          return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
                     : LookAheadHeuristics::ScoreSameOpcode;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2,
                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                         m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return LookAheadHeuristics::ScoreFail;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return LookAheadHeuristics::ScoreFail;
        SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return LookAheadHeuristics::ScoreFail;
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///      G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from
    /// matching the additions at level 0, then moves on to the loads
    /// (level 1). The score of G1 and G2 is higher than G1 and G3, because
    /// {A[0],A[1]} and {B[0],B[1]} match with
    /// LookAheadHeuristics::ScoreConsecutiveLoads, while {A[0],C[0]} has a
    /// score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {

      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If we have reached MaxLevel,
      // or if LHS and RHS are not instructions,
      // or if they are SPLAT,
      // or if they are not consecutive,
      // or if it is profitable to vectorize the loads or extractelements as
      // they are, early-return the current score.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair the OpIdx1'th operand of I1 with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, std::nullopt);
          // Look for the best score.
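          // Illustrative: with I1 = A[0]+B[0] and I2 = B[1]+A[1], pairing
          // OpIdx1 = 0 (A[0]) scores A[0] vs B[1] (fails: different
          // underlying objects) and A[0] vs A[1] (consecutive loads), so
          // MaxOpIdx2 ends up pointing at A[1].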
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };

  /// A helper data structure to hold the operands of a vector of
  /// instructions. This supports a fixed vector length for all operand
  /// vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///             0 Op1
    ///             |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example,
    /// the path from Op2 to the root crosses the RHS of the '-', therefore
    /// the corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g., +, -). Therefore, we can safely use a boolean value for
      /// the APO. It is set to 'true' if 'V' is attached to an inverse
      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
      /// otherwise (e.g., Add/Mul).
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select, for each lane, the
    /// operand that matches best with the operand at the neighboring lane.
    /// Our selection is based on the type of value we are looking for. For
    /// example, if the neighboring lane has a load, we need to look for a
    /// load that is accessing a consecutive address. These strategies are
    /// summarized in the 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;

    /// \returns the operand data at \p OpIdx and \p Lane.
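    /// E.g., for the two-lane bundle {a0 + b0, a1 - b1}, getData(1, 0) holds
    /// {b0, APO:false} and getData(1, 1) holds {b1, APO:true}.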
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with the one at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have a power-of-2
    /// number of unique elements in the lane, since such a lane will be
    /// vectorized with higher probability after removing duplicates.
    /// Currently the SLP vectorizer supports only vectorization of a
    /// power-of-2 number of unique scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      SmallPtrSet<Value *, 4> Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about the number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are
      // extracts themselves and already externally used. Vectorization of
      // such instructions does not add an extra extractelement instruction,
      // it may just remove one.
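      // E.g., extractelement <4 x float> %v, i32 2 (a constant index) is such
      // a vector-like instruction: it is already an extract, so vectorizing
      // it cannot introduce a new extractelement for external users.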
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different numbers of external uses. Allows better selection of the
    /// instructions with fewer external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match; the more they match the higher the
    /// score. This helps break ties in an informed way when we cannot decide
    /// on the order of the operands by just considering the immediate
    /// predecessors.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                       /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          // Set the minimum score for splat-like sequence to avoid setting
          // failed state.
          Score = 1;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best
          // compatible operand in general, just allows to select the
          // operand with all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }

    /// Best scores per lane, kept between the passes. Used to choose the
    /// best operand (the one with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;

    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so
      // we are using the score to differentiate between the two.
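      // Illustrative: if two candidates both match in Opcode mode, the one
      // whose operand sub-trees score higher in the look-ahead (e.g., they
      // feed consecutive loads) wins the tie-break.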
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand score is
      // set to 1 explicitly (because of a non-power-of-2 number of unique
      // scalars), we may want to re-estimate the operands again on the
      // following iterations.
      bool IsUsed =
          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Constant:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast<int>(BestOp.Score)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Splat:
          if (Op == OpLastLane)
            BestOp.Idx = Idx;
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }

    /// Helper for reorder().
    /// \returns the lane that we should start reordering from. This is the
    /// one that has the fewest operands that can move about freely, or that
    /// is the least profitable to reorder because it already has the most
    /// optimal set of operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the fewest operands that can move
      // about freely, or the least profitable lane because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // lowest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one
        // with the least number.
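        // Illustrative: for the add/sub lane mix shown in reorder() below,
        // a sub lane reports NumOfAPOs == 1 while an add lane reports 2, so
        // a sub lane wins the vote and reordering starts there.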
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
          else
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }

    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operand ordering. It encodes each operand's
      /// position id and opcode value, and is used in the voting mechanism to
      /// find the lane with the fewest operands that can move about freely,
      /// or the least profitable lane because it already has the most optimal
      /// set of operands. It could be replaced with a SmallVector<unsigned>,
      /// but a hash code is faster and requires less memory.
      unsigned Hash = 0;
    };
    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). This is used as a heuristic for selecting the
    /// first lane to start operand reordering.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. We therefore need to
      // count how many of them we have for each APO, like this:
      // Cnt[APO] = x. Since we only have two APOs, namely true and false, we
      // can avoid using a map. Instead we can simply count the number of
      // operands that correspond to one of them (in this case the 'true'
      // APO), and calculate the other by subtracting it from the total number
      // of operands. Operands with the same instruction opcode and parent are
      // more profitable since we don't need to move them in many cases, and
      // with high probability such a lane can already be vectorized
      // effectively.
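      // Worked example (illustrative): a 2-operand sub lane has APOs
      // {false, true}, so CntTrue == 1 and NumOfAPOs == max(1, 2 - 1) == 1;
      // an add lane has APOs {false, false}, giving NumOfAPOs ==
      // max(0, 2 - 0) == 2.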
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode
        // and the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }

    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          // Our tree has just 3 nodes: the root and two operands.
          // It is therefore trivial to get the APO. We only need to check
          // the opcode of VL[Lane] and whether the operand at OpIdx is the
          // LHS or RHS operand. The LHS operand of both add and sub is never
          // attached to an inverse operation in the linearized form,
          // therefore its APO is false. The RHS is true only if VL[Lane] is
          // an inverse operation.

          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return OpsVec.size(); }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }

    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector.
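    /// E.g., for lanes {x + a, x + b, x + c} and Op == x, every other lane
    /// also offers an unused x with a matching APO, so a splat is possible.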
    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          if (Data.V == Op) {
            FoundCandidate = true;
            Data.IsUsed = true;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return true;
    }

  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
               const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
        : TLI(TLI), DL(DL), SE(SE), R(R) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }

    // Performs greedy in-place reordering of the operands stored in
    // OpsVec[OpIdx][Lane], for 2 or more operands, so that the operand
    // selected for each lane matches the neighboring lanes as well as
    // possible.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us
      // select the instructions for each lane, so that they match best with
      // the ones we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the
      // lane that has operands that can move the least. For example, given
      // the following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction
      // cannot be reordered. Then we will visit the rest of the lanes in a
      // circular fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (isa<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
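          // E.g., OpLane0 == x with lanes {x + a, x + b, x + c, x + d}: x
          // can fill every lane, so Splat mode is preferred over Opcode mode.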
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // Check that we don't have the same operands everywhere. There is no
      // need to reorder if the operands are just a perfect diamond or a
      // shuffled diamond match. Do not skip reordering for possible
      // broadcasts or for a non-power-of-2 number of scalars (just for now).
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
      };

      // If the initial strategy fails for any of the operand indexes, then
      // we perform reordering again in a second pass. This helps avoid
      // assigning high priority to the failed strategy, and should improve
      // reordering for the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands because they are
        // a perfect or shuffled diamond match.
        // We need to do this to avoid extra external use cost counting for
        // shuffled matches, which may cause regressions.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder
        // the rest of the lanes. We are visiting the nodes in a circular
        // fashion, using FirstLane as the center point and increasing the
        // radius distance.
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(getData(I, FirstLane).V);

        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that best matches the one already
              // chosen for LastLane.
              std::optional<unsigned> BestIdx = getBestOperand(
                  OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
              // By not selecting a value, we allow the operands that follow
              // to select a better matching value. We will get a non-null
              // value in the next run of getBestOperand().
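              // Illustrative: in Load mode, if LastLane picked load(A[0])
              // and this lane offers load(A[1]) and load(B[3]), the
              // consecutive load(A[1]) scores best; if only load(B[3]) were
              // available, no best operand would be found here.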
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx, *BestIdx, Lane);
              } else {
                // We failed to find a best operand, set mode to 'Failed'.
                ReorderingModes[OpIdx] = ReorderingMode::Failed;
                // Enable the second pass.
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during
              // analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS.getOpcode() && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(AltOp.V);
              }
            }
          }
        }
        // Skip the second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                                   raw_ostream &OS) {
      return OS << getModeStr(RMode);
    }

    /// Debug print.
    LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
      printMode(RMode, dbgs());
    }

    friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
      return printMode(RMode, OS);
    }

    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }

    /// Debug print.
    LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif
  };

  /// Evaluate each pair in \p Candidates and return the index into \p
  /// Candidates of the pair with the highest score, which is deemed to have
  /// the best chance to form the root of a profitable tree to vectorize.
  /// Return std::nullopt if no candidate scored above
  /// LookAheadHeuristics::ScoreFail.
  /// \param Limit The lower limit of the score, considered to be good enough.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, std::nullopt);
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Checks if the instruction is marked for deletion.
  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }

  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register given instruction as already analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Checks if the provided list of reduced values was already checked for
  /// vectorization.
  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
    return AnalyzedReductionVals.contains(hash_value(VL));
  }
  /// Adds the list of reduced values to the list of values already checked
  /// for vectorization.
  void analyzedReductionVals(ArrayRef<Value *> VL) {
    AnalyzedReductionVals.insert(hash_value(VL));
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  }
  /// Checks if the given value is gathered in one of the nodes.
  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
    return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
  }

  /// Check if the value is vectorized in the tree.
  bool isVectorized(Value *V) const { return getTreeEntry(V); }

  ~BoUpSLP();

private:
  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only
  /// one user and are reorderable).
  /// \param ReorderableGathers List of all gather nodes that require
  /// reordering (e.g., gathers of extractelements or partially vectorizable
  /// loads).
  /// \param GatherOps List of gather operand nodes for \p UserTE that require
  /// reordering, a subset of \p ReorderableGathers.
  bool
  canReorderOperands(TreeEntry *UserTE,
                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                     ArrayRef<TreeEntry *> ReorderableGathers,
                     SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Checks if the given \p TE is a gather node with clustered reused scalars
  /// and reorders it per given \p Mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns the vectorized operand \p OpIdx of the node \p UserTE from the
  /// graph, if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [this, &TE](Value *V) {
      TE = getTreeEntry(V);
      return TE;
    });
    if (It != VL.end() && TE->isSame(VL))
      return TE;
    return nullptr;
  }

  /// Returns the vectorized operand \p OpIdx of the node \p UserTE from the
  /// graph, if any. If it is not vectorized (gather node), returns nullptr.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
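  /// E.g., a scalar whose every user already has a TreeEntry needs no
  /// extractelement after vectorization, so this returns true for it.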
  bool areAllUsersVectorized(Instruction *I,
                             ArrayRef<Value *> VectorizedVals) const;

  /// Return information about the vector formed for the specified \p OpIdx
  /// of a vector of (the same) instructions.
  TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
                                                       unsigned OpIdx);

  /// \returns the cost of the vectorizable entry.
  InstructionCost getEntryCost(const TreeEntry *E,
                               ArrayRef<Value *> VectorizedVals,
                               SmallPtrSetImpl<Value *> &CheckedExtracts);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
  /// can be vectorized to use the original vector (or aggregate "bitcast" to
  /// a vector) and sets \p CurrentOrder to the identity permutation;
  /// otherwise returns false, setting \p CurrentOrder to either an empty
  /// vector or a non-identity permutation that allows reusing extract
  /// instructions.
  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                       SmallVectorImpl<unsigned> &CurrentOrder) const;

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts
  /// for ease of later optimization.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts
  /// for ease of later optimization.
  Value *createBuildVector(const TreeEntry *E);

  /// Returns the instruction in the bundle, which can be used as a base
  /// point for scheduling. Usually it is the last instruction in the bundle,
  /// except for the case when all operands are external (in this case, it is
  /// the first instruction in the list).
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Checks if the gathered \p VL can be represented as shuffle(s) of
  /// previous tree entries.
  /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations.
  /// \returns ShuffleKind, if gathered values can be represented as shuffles
  /// of previous tree entries. \p Mask is filled with the shuffle mask.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
                        SmallVectorImpl<int> &Mask,
                        SmallVectorImpl<const TreeEntry *> &Entries);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from
  /// the roots. This method calculates the cost of extracting the values.
  /// \param ForPoisonSrc true if the initial vector is poison, false
  /// otherwise.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;

  /// Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns a vector from a collection of scalars in \p VL. If \p Root is
  /// not specified, the starting vector value is poison.
  Value *gather(ArrayRef<Value *> VL, Value *Root);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Reorder commutative or alt operands to get better probability of
  /// generating vectorized code.
  static void reorderInputsAccordingToOpcode(
      ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
      SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
      const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);

  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over
  /// the users of \p TE and collects the stores. It returns the map from the
  /// store pointers to the collected stores.
  DenseMap<Value *, SmallVector<StoreInst *, 4>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
  /// stores in \p StoresVec can form a vector instruction. If so it returns
  /// true and populates \p ReorderIndices with the shuffle indices of the
  /// stores when compared to the sorted vector.
  bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
                     OrdersType &ReorderIndices) const;

  /// Iterates through the users of \p TE, looking for scalar stores that can
  /// be potentially vectorized in a future SLP-tree. If found, it keeps track
  /// of their order and builds an order index vector for each store bundle.
  /// It returns all these order vectors found.
  /// We run this after the tree has formed, otherwise we may come across
  /// user instructions that are not yet in the tree.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      ::addMask(Mask, ReuseShuffleIndices);
      return Mask;
    }

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered; we can
        // still treat the vector as the same if the list of scalars matches
        // VL directly, without reordering.
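        // Illustrative: Scalars == {a, b} with ReorderIndices == {1, 0}
        // inverts to Mask == {1, 0}, so VL == {b, a} is reported as the same
        // entry.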
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \returns The final vectorization factor for the node. Defined by the
    /// total number of vectorized scalars, including those used several
    /// times in the entry and counted in the \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to
    /// Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
    EntryState State;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef
    /// has to be a pointer and needs to be able to initialize the child
    /// iterator. Thus we need a reference back to the container to translate
    /// the indices to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry. We can
    /// actually have multiple users so the data structure is not truly a
    /// tree.
    SmallVector<EdgeInfo, 1> UserTreeIndices;

    /// The index of this treeEntry in VectorizableTree.
    int Idx = -1;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTree_rec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// The main/alternate instruction.
    Instruction *MainOp = nullptr;
    Instruction *AltOp = nullptr;

  public:
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
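    /// E.g., for a four-lane bundle of adds, setOperand(0, {a0, a1, a2, a3})
    /// records the left-hand operand of every lane.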
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set the operands of this bundle in their original order.
    void setOperandsInOrder() {
      assert(Operands.empty() && "Already initialized?");
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        Operands[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
        }
      }
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same
    /// (or alternate) opcode as the main operation, the key is \p Op.
    /// Otherwise the key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }

    Instruction *getMainOp() const {
      return MainOp;
    }

    Instruction *getAltOp() const {
      return AltOp;
    }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }

    /// \returns the lane of \p V within the vector of Scalars: the raw
    /// position of \p V, remapped through ReorderIndices and
    /// ReuseShuffleIndices when those are not empty.
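    /// E.g., with Scalars == {a, b} and ReorderIndices == {1, 0},
    /// findLaneForValue(a) remaps the raw position 0 to lane 1.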
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      }
      return FoundLane;
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
  };

#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
TreeEntry::Vectorize : TreeEntry::NeedToGather; 2857 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, 2858 ReuseShuffleIndices, ReorderIndices); 2859 } 2860 2861 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 2862 TreeEntry::EntryState EntryState, 2863 std::optional<ScheduleData *> Bundle, 2864 const InstructionsState &S, 2865 const EdgeInfo &UserTreeIdx, 2866 ArrayRef<int> ReuseShuffleIndices = std::nullopt, 2867 ArrayRef<unsigned> ReorderIndices = std::nullopt) { 2868 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || 2869 (Bundle && EntryState != TreeEntry::NeedToGather)) && 2870 "Need to vectorize gather entry?"); 2871 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); 2872 TreeEntry *Last = VectorizableTree.back().get(); 2873 Last->Idx = VectorizableTree.size() - 1; 2874 Last->State = EntryState; 2875 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), 2876 ReuseShuffleIndices.end()); 2877 if (ReorderIndices.empty()) { 2878 Last->Scalars.assign(VL.begin(), VL.end()); 2879 Last->setOperations(S); 2880 } else { 2881 // Reorder scalars and build final mask. 2882 Last->Scalars.assign(VL.size(), nullptr); 2883 transform(ReorderIndices, Last->Scalars.begin(), 2884 [VL](unsigned Idx) -> Value * { 2885 if (Idx >= VL.size()) 2886 return UndefValue::get(VL.front()->getType()); 2887 return VL[Idx]; 2888 }); 2889 InstructionsState S = getSameOpcode(Last->Scalars, *TLI); 2890 Last->setOperations(S); 2891 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); 2892 } 2893 if (Last->State != TreeEntry::NeedToGather) { 2894 for (Value *V : VL) { 2895 assert(!getTreeEntry(V) && "Scalar already in tree!"); 2896 ScalarToTreeEntry[V] = Last; 2897 } 2898 // Update the scheduler bundle to point to this TreeEntry. 2899 ScheduleData *BundleMember = *Bundle; 2900 assert((BundleMember || isa<PHINode>(S.MainOp) || 2901 isVectorLikeInstWithConstOps(S.MainOp) || 2902 doesNotNeedToSchedule(VL)) && 2903 "Bundle and VL out of sync"); 2904 if (BundleMember) { 2905 for (Value *V : VL) { 2906 if (doesNotNeedToBeScheduled(V)) 2907 continue; 2908 assert(BundleMember && "Unexpected end of bundle."); 2909 BundleMember->TE = Last; 2910 BundleMember = BundleMember->NextInBundle; 2911 } 2912 } 2913 assert(!BundleMember && "Bundle and VL out of sync"); 2914 } else { 2915 MustGather.insert(VL.begin(), VL.end()); 2916 } 2917 2918 if (UserTreeIdx.UserTE) 2919 Last->UserTreeIndices.push_back(UserTreeIdx); 2920 2921 return Last; 2922 } 2923 2924 /// -- Vectorization State -- 2925 /// Holds all of the tree entries. 2926 TreeEntry::VecTreeTy VectorizableTree; 2927 2928 #ifndef NDEBUG 2929 /// Debug printer. 2930 LLVM_DUMP_METHOD void dumpVectorizableTree() const { 2931 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { 2932 VectorizableTree[Id]->dump(); 2933 dbgs() << "\n"; 2934 } 2935 } 2936 #endif 2937 2938 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } 2939 2940 const TreeEntry *getTreeEntry(Value *V) const { 2941 return ScalarToTreeEntry.lookup(V); 2942 } 2943 2944 /// Checks if the specified list of the instructions/values can be vectorized 2945 /// and fills required data before actual scheduling of the instructions. 2946 TreeEntry::EntryState getScalarsVectorizationState( 2947 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, 2948 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const; 2949 2950 /// Maps a specific scalar to its tree entry. 
2951 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2952
2953 /// Maps a value to the proposed vectorizable size.
2954 SmallDenseMap<Value *, unsigned> InstrElementSize;
2955
2956 /// A list of scalars that we found we need to keep as scalars.
2957 ValueSet MustGather;
2958
2959 /// A map between the vectorized entries and the last instructions in the
2960 /// bundles. The bundles are built in use order, not in the def order of the
2961 /// instructions. So, during vectorization we cannot rely on the last
2962 /// instruction in the bundle also being the last instruction in program
2963 /// order, since the basic blocks may be modified; the last instructions
2964 /// need to be pre-gathered beforehand.
2965 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2966
2967 /// List of gather nodes, which depend on other gather/vector nodes and should
2968 /// be emitted after the vector instruction emission process, to correctly
2969 /// handle the order of the vector instructions and shuffles.
2970 SetVector<const TreeEntry *> PostponedGathers;
2971
2972 using ValueToGatherNodesMap =
2973 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
2974 ValueToGatherNodesMap ValueToGatherNodes;
2975
2976 /// This POD struct describes one external user in the vectorized tree.
2977 struct ExternalUser {
2978 ExternalUser(Value *S, llvm::User *U, int L)
2979 : Scalar(S), User(U), Lane(L) {}
2980
2981 // Which scalar in our function.
2982 Value *Scalar;
2983
2984 // The user that uses the scalar.
2985 llvm::User *User;
2986
2987 // Which lane does the scalar belong to.
2988 int Lane;
2989 };
2990 using UserList = SmallVector<ExternalUser, 16>;
2991
2992 /// Checks if two instructions may access the same memory.
2993 ///
2994 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2995 /// is invariant in the calling loop.
2996 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2997 Instruction *Inst2) {
2998 // First check if the result is already in the cache.
2999 AliasCacheKey key = std::make_pair(Inst1, Inst2);
3000 std::optional<bool> &result = AliasCache[key];
3001 if (result) {
3002 return *result;
3003 }
3004 bool aliased = true;
3005 if (Loc1.Ptr && isSimple(Inst1))
3006 aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3007 // Store the result in the cache.
3008 result = aliased;
3009 return aliased;
3010 }
3011
3012 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3013
3014 /// Cache for alias results.
3015 /// TODO: consider moving this to the AliasAnalysis itself.
3016 DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
3017
3018 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3019 // globally through SLP because we don't perform any action which
3020 // invalidates capture results.
3021 BatchAAResults BatchAA;
3022
3023 /// Temporary store for deleted instructions. Instructions will be deleted
3024 /// eventually when the BoUpSLP is destructed. The deferral is required to
3025 /// ensure that there are no incorrect collisions in the AliasCache, which
3026 /// can happen if a new instruction is allocated at the same address as a
3027 /// previously deleted instruction.
3028 DenseSet<Instruction *> DeletedInstructions;
3029
3030 /// Set of instructions already analyzed as reduction roots.
3031 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3032
3033 /// Set of hashes for the list of reduction values already being analyzed.
3034 DenseSet<size_t> AnalyzedReductionVals;
3035
3036 /// A list of values that need to be extracted out of the tree.
3037 /// This list holds pairs of (Internal Scalar : External User). External User
3038 /// can be nullptr, meaning that this Internal Scalar will be used later,
3039 /// after vectorization.
3040 UserList ExternalUses;
3041
3042 /// Values used only by @llvm.assume calls.
3043 SmallPtrSet<const Value *, 32> EphValues;
3044
3045 /// Holds all of the instructions that we gathered, shuffle instructions and
3046 /// extractelements.
3047 SetVector<Instruction *> GatherShuffleExtractSeq;
3048
3049 /// A list of blocks that we are going to CSE.
3050 SetVector<BasicBlock *> CSEBlocks;
3051
3052 /// Contains all scheduling relevant data for an instruction.
3053 /// A ScheduleData either represents a single instruction or a member of an
3054 /// instruction bundle (= a group of instructions which is combined into a
3055 /// vector instruction).
3056 struct ScheduleData {
3057 // The initial value for the dependency counters. It means that the
3058 // dependencies are not calculated yet.
3059 enum { InvalidDeps = -1 };
3060
3061 ScheduleData() = default;
3062
3063 void init(int BlockSchedulingRegionID, Value *OpVal) {
3064 FirstInBundle = this;
3065 NextInBundle = nullptr;
3066 NextLoadStore = nullptr;
3067 IsScheduled = false;
3068 SchedulingRegionID = BlockSchedulingRegionID;
3069 clearDependencies();
3070 OpValue = OpVal;
3071 TE = nullptr;
3072 }
3073
3074 /// Verify basic self-consistency properties.
3075 void verify() {
3076 if (hasValidDependencies()) {
3077 assert(UnscheduledDeps <= Dependencies && "invariant");
3078 } else {
3079 assert(UnscheduledDeps == Dependencies && "invariant");
3080 }
3081
3082 if (IsScheduled) {
3083 assert(isSchedulingEntity() &&
3084 "unexpected scheduled state");
3085 for (const ScheduleData *BundleMember = this; BundleMember;
3086 BundleMember = BundleMember->NextInBundle) {
3087 assert(BundleMember->hasValidDependencies() &&
3088 BundleMember->UnscheduledDeps == 0 &&
3089 "unexpected scheduled state");
3090 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3091 "only bundle is marked scheduled");
3092 }
3093 }
3094
3095 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3096 "all bundle members must be in same basic block");
3097 }
3098
3099 /// Returns true if the dependency information has been calculated.
3100 /// Note that dependency validity can vary between instructions within
3101 /// a single bundle.
3102 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3103
3104 /// Returns true for single instructions and for bundle representatives
3105 /// (= the head of a bundle).
3106 bool isSchedulingEntity() const { return FirstInBundle == this; }
3107
3108 /// Returns true if it represents an instruction bundle and not only a
3109 /// single instruction.
3110 bool isPartOfBundle() const {
3111 return NextInBundle != nullptr || FirstInBundle != this || TE;
3112 }
3113
3114 /// Returns true if it is ready for scheduling, i.e. it has no more
3115 /// unscheduled dependent instructions/bundles.
3116 bool isReady() const {
3117 assert(isSchedulingEntity() &&
3118 "can't consider non-scheduling entity for ready list");
3119 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3120 }
3121
3122 /// Modifies the number of unscheduled dependencies for this instruction,
3123 /// and returns the number of remaining dependencies for the containing
3124 /// bundle.
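/// For example, the scheduler calls incrementUnscheduledDeps(-1) on each
/// dependency once it has been scheduled (see schedule() below); a return
/// value of 0 means the whole containing bundle has become ready.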
3125 int incrementUnscheduledDeps(int Incr) {
3126 assert(hasValidDependencies() &&
3127 "increment of unscheduled deps would be meaningless");
3128 UnscheduledDeps += Incr;
3129 return FirstInBundle->unscheduledDepsInBundle();
3130 }
3131
3132 /// Sets the number of unscheduled dependencies to the number of
3133 /// dependencies.
3134 void resetUnscheduledDeps() {
3135 UnscheduledDeps = Dependencies;
3136 }
3137
3138 /// Clears all dependency information.
3139 void clearDependencies() {
3140 Dependencies = InvalidDeps;
3141 resetUnscheduledDeps();
3142 MemoryDependencies.clear();
3143 ControlDependencies.clear();
3144 }
3145
3146 int unscheduledDepsInBundle() const {
3147 assert(isSchedulingEntity() && "only meaningful on the bundle");
3148 int Sum = 0;
3149 for (const ScheduleData *BundleMember = this; BundleMember;
3150 BundleMember = BundleMember->NextInBundle) {
3151 if (BundleMember->UnscheduledDeps == InvalidDeps)
3152 return InvalidDeps;
3153 Sum += BundleMember->UnscheduledDeps;
3154 }
3155 return Sum;
3156 }
3157
3158 void dump(raw_ostream &os) const {
3159 if (!isSchedulingEntity()) {
3160 os << "/ " << *Inst;
3161 } else if (NextInBundle) {
3162 os << '[' << *Inst;
3163 ScheduleData *SD = NextInBundle;
3164 while (SD) {
3165 os << ';' << *SD->Inst;
3166 SD = SD->NextInBundle;
3167 }
3168 os << ']';
3169 } else {
3170 os << *Inst;
3171 }
3172 }
3173
3174 Instruction *Inst = nullptr;
3175
3176 /// Opcode of the current instruction in the schedule data.
3177 Value *OpValue = nullptr;
3178
3179 /// The TreeEntry that this instruction corresponds to.
3180 TreeEntry *TE = nullptr;
3181
3182 /// Points to the head in an instruction bundle (and always to this for
3183 /// single instructions).
3184 ScheduleData *FirstInBundle = nullptr;
3185
3186 /// Singly linked list of all instructions in a bundle. Null if it is a
3187 /// single instruction.
3188 ScheduleData *NextInBundle = nullptr;
3189
3190 /// Singly linked list of all memory instructions (e.g. load, store, call)
3191 /// in the block - until the end of the scheduling region.
3192 ScheduleData *NextLoadStore = nullptr;
3193
3194 /// The dependent memory instructions.
3195 /// This list is derived on demand in calculateDependencies().
3196 SmallVector<ScheduleData *, 4> MemoryDependencies;
3197
3198 /// List of instructions which this instruction could be control dependent
3199 /// on. Allowing such nodes to be scheduled below this one could introduce
3200 /// a runtime fault which didn't exist in the original program.
3201 /// e.g. a load or udiv following a readonly call which infinitely loops.
3202 SmallVector<ScheduleData *, 4> ControlDependencies;
3203
3204 /// This ScheduleData is in the current scheduling region if this matches
3205 /// the current SchedulingRegionID of BlockScheduling.
3206 int SchedulingRegionID = 0;
3207
3208 /// Used for getting a "good" final ordering of instructions.
3209 int SchedulingPriority = 0;
3210
3211 /// The number of dependencies. Consists of the number of users of the
3212 /// instruction plus the number of dependent memory instructions (if any).
3213 /// This value is calculated on demand.
3214 /// If InvalidDeps, the number of dependencies is not calculated yet.
3215 int Dependencies = InvalidDeps;
3216
3217 /// The number of dependencies minus the number of dependencies of scheduled
3218 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3219 /// for scheduling.
3220 /// Note that this is negative as long as Dependencies is not calculated.
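/// (Both counters are then InvalidDeps = -1, which keeps the invariant
/// UnscheduledDeps == Dependencies that verify() checks for unseen nodes.)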
3221 int UnscheduledDeps = InvalidDeps;
3222
3223 /// True if this instruction is scheduled (or considered as scheduled in the
3224 /// dry-run).
3225 bool IsScheduled = false;
3226 };
3227
3228 #ifndef NDEBUG
3229 friend inline raw_ostream &operator<<(raw_ostream &os,
3230 const BoUpSLP::ScheduleData &SD) {
3231 SD.dump(os);
3232 return os;
3233 }
3234 #endif
3235
3236 friend struct GraphTraits<BoUpSLP *>;
3237 friend struct DOTGraphTraits<BoUpSLP *>;
3238
3239 /// Contains all scheduling data for a basic block.
3240 /// It does not schedule instructions that are not memory read/write
3241 /// instructions and whose operands are either constants, arguments, phis, or
3242 /// instructions from other blocks, or whose users are phis or belong to
3243 /// other blocks. The resulting vector instructions can be placed at the
3244 /// beginning of the basic block without scheduling (if the operands do not
3245 /// need to be scheduled) or at the end of the block (if the users are
3246 /// outside of the block). This saves some compile time and memory used by
3247 /// the compiler.
3248 /// ScheduleData is assigned to each instruction between the boundaries of a
3249 /// tree entry, even to those which are not part of the graph; this is
3250 /// required to correctly track the dependencies between the instructions and
3251 /// schedule them correctly. ScheduleData is not allocated for instructions
3252 /// which do not require scheduling, like phis, nodes with
3253 /// extractelements/insertelements only, or nodes whose instructions have
3254 /// uses/operands outside of the block.
3255 struct BlockScheduling {
3256 BlockScheduling(BasicBlock *BB)
3257 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3258
3259 void clear() {
3260 ReadyInsts.clear();
3261 ScheduleStart = nullptr;
3262 ScheduleEnd = nullptr;
3263 FirstLoadStoreInRegion = nullptr;
3264 LastLoadStoreInRegion = nullptr;
3265 RegionHasStackSave = false;
3266
3267 // Reduce the maximum schedule region size by the size of the
3268 // previous scheduling run.
3269 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3270 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3271 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3272 ScheduleRegionSize = 0;
3273
3274 // Make a new scheduling region, i.e. all existing ScheduleData is not
3275 // in the new region yet.
3276 ++SchedulingRegionID;
3277 }
3278
3279 ScheduleData *getScheduleData(Instruction *I) {
3280 if (BB != I->getParent())
3281 // Avoid lookup if can't possibly be in map.
3282 return nullptr;
3283 ScheduleData *SD = ScheduleDataMap.lookup(I);
3284 if (SD && isInSchedulingRegion(SD))
3285 return SD;
3286 return nullptr;
3287 }
3288
3289 ScheduleData *getScheduleData(Value *V) {
3290 if (auto *I = dyn_cast<Instruction>(V))
3291 return getScheduleData(I);
3292 return nullptr;
3293 }
3294
3295 ScheduleData *getScheduleData(Value *V, Value *Key) {
3296 if (V == Key)
3297 return getScheduleData(V);
3298 auto I = ExtraScheduleDataMap.find(V);
3299 if (I != ExtraScheduleDataMap.end()) {
3300 ScheduleData *SD = I->second.lookup(Key);
3301 if (SD && isInSchedulingRegion(SD))
3302 return SD;
3303 }
3304 return nullptr;
3305 }
3306
3307 bool isInSchedulingRegion(ScheduleData *SD) const {
3308 return SD->SchedulingRegionID == SchedulingRegionID;
3309 }
3310
3311 /// Marks an instruction as scheduled and puts all dependent ready
3312 /// instructions into the ready-list.
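/// Three kinds of dependencies are processed below: def-use operand
/// dependencies, memory dependencies, and control dependencies. Any dependent
/// bundle whose unscheduled-dependency count drops to zero is inserted into
/// \p ReadyList.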
3313 template <typename ReadyListType> 3314 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 3315 SD->IsScheduled = true; 3316 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 3317 3318 for (ScheduleData *BundleMember = SD; BundleMember; 3319 BundleMember = BundleMember->NextInBundle) { 3320 if (BundleMember->Inst != BundleMember->OpValue) 3321 continue; 3322 3323 // Handle the def-use chain dependencies. 3324 3325 // Decrement the unscheduled counter and insert to ready list if ready. 3326 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { 3327 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { 3328 if (OpDef && OpDef->hasValidDependencies() && 3329 OpDef->incrementUnscheduledDeps(-1) == 0) { 3330 // There are no more unscheduled dependencies after 3331 // decrementing, so we can put the dependent instruction 3332 // into the ready list. 3333 ScheduleData *DepBundle = OpDef->FirstInBundle; 3334 assert(!DepBundle->IsScheduled && 3335 "already scheduled bundle gets ready"); 3336 ReadyList.insert(DepBundle); 3337 LLVM_DEBUG(dbgs() 3338 << "SLP: gets ready (def): " << *DepBundle << "\n"); 3339 } 3340 }); 3341 }; 3342 3343 // If BundleMember is a vector bundle, its operands may have been 3344 // reordered during buildTree(). We therefore need to get its operands 3345 // through the TreeEntry. 3346 if (TreeEntry *TE = BundleMember->TE) { 3347 // Need to search for the lane since the tree entry can be reordered. 3348 int Lane = std::distance(TE->Scalars.begin(), 3349 find(TE->Scalars, BundleMember->Inst)); 3350 assert(Lane >= 0 && "Lane not set"); 3351 3352 // Since vectorization tree is being built recursively this assertion 3353 // ensures that the tree entry has all operands set before reaching 3354 // this code. Couple of exceptions known at the moment are extracts 3355 // where their second (immediate) operand is not added. Since 3356 // immediates do not affect scheduler behavior this is considered 3357 // okay. 3358 auto *In = BundleMember->Inst; 3359 assert(In && 3360 (isa<ExtractValueInst, ExtractElementInst>(In) || 3361 In->getNumOperands() == TE->getNumOperands()) && 3362 "Missed TreeEntry operands?"); 3363 (void)In; // fake use to avoid build failure when assertions disabled 3364 3365 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); 3366 OpIdx != NumOperands; ++OpIdx) 3367 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) 3368 DecrUnsched(I); 3369 } else { 3370 // If BundleMember is a stand-alone instruction, no operand reordering 3371 // has taken place, so we directly access its operands. 3372 for (Use &U : BundleMember->Inst->operands()) 3373 if (auto *I = dyn_cast<Instruction>(U.get())) 3374 DecrUnsched(I); 3375 } 3376 // Handle the memory dependencies. 3377 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 3378 if (MemoryDepSD->hasValidDependencies() && 3379 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 3380 // There are no more unscheduled dependencies after decrementing, 3381 // so we can put the dependent instruction into the ready list. 3382 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 3383 assert(!DepBundle->IsScheduled && 3384 "already scheduled bundle gets ready"); 3385 ReadyList.insert(DepBundle); 3386 LLVM_DEBUG(dbgs() 3387 << "SLP: gets ready (mem): " << *DepBundle << "\n"); 3388 } 3389 } 3390 // Handle the control dependencies. 
3391 for (ScheduleData *DepSD : BundleMember->ControlDependencies) { 3392 if (DepSD->incrementUnscheduledDeps(-1) == 0) { 3393 // There are no more unscheduled dependencies after decrementing, 3394 // so we can put the dependent instruction into the ready list. 3395 ScheduleData *DepBundle = DepSD->FirstInBundle; 3396 assert(!DepBundle->IsScheduled && 3397 "already scheduled bundle gets ready"); 3398 ReadyList.insert(DepBundle); 3399 LLVM_DEBUG(dbgs() 3400 << "SLP: gets ready (ctl): " << *DepBundle << "\n"); 3401 } 3402 } 3403 } 3404 } 3405 3406 /// Verify basic self consistency properties of the data structure. 3407 void verify() { 3408 if (!ScheduleStart) 3409 return; 3410 3411 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && 3412 ScheduleStart->comesBefore(ScheduleEnd) && 3413 "Not a valid scheduling region?"); 3414 3415 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 3416 auto *SD = getScheduleData(I); 3417 if (!SD) 3418 continue; 3419 assert(isInSchedulingRegion(SD) && 3420 "primary schedule data not in window?"); 3421 assert(isInSchedulingRegion(SD->FirstInBundle) && 3422 "entire bundle in window!"); 3423 (void)SD; 3424 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); 3425 } 3426 3427 for (auto *SD : ReadyInsts) { 3428 assert(SD->isSchedulingEntity() && SD->isReady() && 3429 "item in ready list not ready?"); 3430 (void)SD; 3431 } 3432 } 3433 3434 void doForAllOpcodes(Value *V, 3435 function_ref<void(ScheduleData *SD)> Action) { 3436 if (ScheduleData *SD = getScheduleData(V)) 3437 Action(SD); 3438 auto I = ExtraScheduleDataMap.find(V); 3439 if (I != ExtraScheduleDataMap.end()) 3440 for (auto &P : I->second) 3441 if (isInSchedulingRegion(P.second)) 3442 Action(P.second); 3443 } 3444 3445 /// Put all instructions into the ReadyList which are ready for scheduling. 3446 template <typename ReadyListType> 3447 void initialFillReadyList(ReadyListType &ReadyList) { 3448 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 3449 doForAllOpcodes(I, [&](ScheduleData *SD) { 3450 if (SD->isSchedulingEntity() && SD->hasValidDependencies() && 3451 SD->isReady()) { 3452 ReadyList.insert(SD); 3453 LLVM_DEBUG(dbgs() 3454 << "SLP: initially in ready list: " << *SD << "\n"); 3455 } 3456 }); 3457 } 3458 } 3459 3460 /// Build a bundle from the ScheduleData nodes corresponding to the 3461 /// scalar instruction for each lane. 3462 ScheduleData *buildBundle(ArrayRef<Value *> VL); 3463 3464 /// Checks if a bundle of instructions can be scheduled, i.e. has no 3465 /// cyclic dependencies. This is only a dry-run, no instructions are 3466 /// actually moved at this stage. 3467 /// \returns the scheduling bundle. The returned Optional value is not 3468 /// std::nullopt if \p VL is allowed to be scheduled. 3469 std::optional<ScheduleData *> 3470 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 3471 const InstructionsState &S); 3472 3473 /// Un-bundles a group of instructions. 3474 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); 3475 3476 /// Allocates schedule data chunk. 3477 ScheduleData *allocateScheduleDataChunks(); 3478 3479 /// Extends the scheduling region so that V is inside the region. 3480 /// \returns true if the region size is within the limit. 3481 bool extendSchedulingRegion(Value *V, const InstructionsState &S); 3482 3483 /// Initialize the ScheduleData structures for new instructions in the 3484 /// scheduling region. 
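/// \p PrevLoadStore and \p NextLoadStore give the neighboring memory
/// instructions used to link the new ScheduleData into the region's
/// NextLoadStore chain.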
3485 void initScheduleData(Instruction *FromI, Instruction *ToI, 3486 ScheduleData *PrevLoadStore, 3487 ScheduleData *NextLoadStore); 3488 3489 /// Updates the dependency information of a bundle and of all instructions/ 3490 /// bundles which depend on the original bundle. 3491 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 3492 BoUpSLP *SLP); 3493 3494 /// Sets all instruction in the scheduling region to un-scheduled. 3495 void resetSchedule(); 3496 3497 BasicBlock *BB; 3498 3499 /// Simple memory allocation for ScheduleData. 3500 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 3501 3502 /// The size of a ScheduleData array in ScheduleDataChunks. 3503 int ChunkSize; 3504 3505 /// The allocator position in the current chunk, which is the last entry 3506 /// of ScheduleDataChunks. 3507 int ChunkPos; 3508 3509 /// Attaches ScheduleData to Instruction. 3510 /// Note that the mapping survives during all vectorization iterations, i.e. 3511 /// ScheduleData structures are recycled. 3512 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; 3513 3514 /// Attaches ScheduleData to Instruction with the leading key. 3515 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> 3516 ExtraScheduleDataMap; 3517 3518 /// The ready-list for scheduling (only used for the dry-run). 3519 SetVector<ScheduleData *> ReadyInsts; 3520 3521 /// The first instruction of the scheduling region. 3522 Instruction *ScheduleStart = nullptr; 3523 3524 /// The first instruction _after_ the scheduling region. 3525 Instruction *ScheduleEnd = nullptr; 3526 3527 /// The first memory accessing instruction in the scheduling region 3528 /// (can be null). 3529 ScheduleData *FirstLoadStoreInRegion = nullptr; 3530 3531 /// The last memory accessing instruction in the scheduling region 3532 /// (can be null). 3533 ScheduleData *LastLoadStoreInRegion = nullptr; 3534 3535 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling 3536 /// region? Used to optimize the dependence calculation for the 3537 /// common case where there isn't. 3538 bool RegionHasStackSave = false; 3539 3540 /// The current size of the scheduling region. 3541 int ScheduleRegionSize = 0; 3542 3543 /// The maximum size allowed for the scheduling region. 3544 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; 3545 3546 /// The ID of the scheduling region. For a new vectorization iteration this 3547 /// is incremented which "removes" all ScheduleData from the region. 3548 /// Make sure that the initial SchedulingRegionID is greater than the 3549 /// initial SchedulingRegionID in ScheduleData (which is 0). 3550 int SchedulingRegionID = 1; 3551 }; 3552 3553 /// Attaches the BlockScheduling structures to basic blocks. 3554 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 3555 3556 /// Performs the "real" scheduling. Done before vectorization is actually 3557 /// performed in a basic block. 3558 void scheduleBlock(BlockScheduling *BS); 3559 3560 /// List of users to ignore during scheduling and that don't need extracting. 3561 const SmallDenseSet<Value *> *UserIgnoreList = nullptr; 3562 3563 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of 3564 /// sorted SmallVectors of unsigned. 
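/// The special empty and tombstone keys below are the one-element orders
/// {~1U} and {~2U}; real reorder indices are always smaller than the vector
/// factor, so these sentinels cannot collide with actual orders.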
3565 struct OrdersTypeDenseMapInfo { 3566 static OrdersType getEmptyKey() { 3567 OrdersType V; 3568 V.push_back(~1U); 3569 return V; 3570 } 3571 3572 static OrdersType getTombstoneKey() { 3573 OrdersType V; 3574 V.push_back(~2U); 3575 return V; 3576 } 3577 3578 static unsigned getHashValue(const OrdersType &V) { 3579 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); 3580 } 3581 3582 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { 3583 return LHS == RHS; 3584 } 3585 }; 3586 3587 // Analysis and block reference. 3588 Function *F; 3589 ScalarEvolution *SE; 3590 TargetTransformInfo *TTI; 3591 TargetLibraryInfo *TLI; 3592 LoopInfo *LI; 3593 DominatorTree *DT; 3594 AssumptionCache *AC; 3595 DemandedBits *DB; 3596 const DataLayout *DL; 3597 OptimizationRemarkEmitter *ORE; 3598 3599 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. 3600 unsigned MinVecRegSize; // Set by cl::opt (default: 128). 3601 3602 /// Instruction builder to construct the vectorized tree. 3603 IRBuilder<> Builder; 3604 3605 /// A map of scalar integer values to the smallest bit width with which they 3606 /// can legally be represented. The values map to (width, signed) pairs, 3607 /// where "width" indicates the minimum bit width and "signed" is True if the 3608 /// value must be signed-extended, rather than zero-extended, back to its 3609 /// original width. 3610 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; 3611 }; 3612 3613 } // end namespace slpvectorizer 3614 3615 template <> struct GraphTraits<BoUpSLP *> { 3616 using TreeEntry = BoUpSLP::TreeEntry; 3617 3618 /// NodeRef has to be a pointer per the GraphWriter. 3619 using NodeRef = TreeEntry *; 3620 3621 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; 3622 3623 /// Add the VectorizableTree to the index iterator to be able to return 3624 /// TreeEntry pointers. 3625 struct ChildIteratorType 3626 : public iterator_adaptor_base< 3627 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { 3628 ContainerTy &VectorizableTree; 3629 3630 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, 3631 ContainerTy &VT) 3632 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} 3633 3634 NodeRef operator*() { return I->UserTE; } 3635 }; 3636 3637 static NodeRef getEntryNode(BoUpSLP &R) { 3638 return R.VectorizableTree[0].get(); 3639 } 3640 3641 static ChildIteratorType child_begin(NodeRef N) { 3642 return {N->UserTreeIndices.begin(), N->Container}; 3643 } 3644 3645 static ChildIteratorType child_end(NodeRef N) { 3646 return {N->UserTreeIndices.end(), N->Container}; 3647 } 3648 3649 /// For the node iterator we just need to turn the TreeEntry iterator into a 3650 /// TreeEntry* iterator so that it dereferences to NodeRef. 
3651 class nodes_iterator {
3652 using ItTy = ContainerTy::iterator;
3653 ItTy It;
3654
3655 public:
3656 nodes_iterator(const ItTy &It2) : It(It2) {}
3657 NodeRef operator*() { return It->get(); }
3658 nodes_iterator operator++() {
3659 ++It;
3660 return *this;
3661 }
3662 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3663 };
3664
3665 static nodes_iterator nodes_begin(BoUpSLP *R) {
3666 return nodes_iterator(R->VectorizableTree.begin());
3667 }
3668
3669 static nodes_iterator nodes_end(BoUpSLP *R) {
3670 return nodes_iterator(R->VectorizableTree.end());
3671 }
3672
3673 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3674 };
3675
3676 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3677 using TreeEntry = BoUpSLP::TreeEntry;
3678
3679 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3680
3681 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3682 std::string Str;
3683 raw_string_ostream OS(Str);
3684 OS << Entry->Idx << ".\n";
3685 if (isSplat(Entry->Scalars))
3686 OS << "<splat> ";
3687 for (auto *V : Entry->Scalars) {
3688 OS << *V;
3689 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3690 return EU.Scalar == V;
3691 }))
3692 OS << " <extract>";
3693 OS << "\n";
3694 }
3695 return Str;
3696 }
3697
3698 static std::string getNodeAttributes(const TreeEntry *Entry,
3699 const BoUpSLP *) {
3700 if (Entry->State == TreeEntry::NeedToGather)
3701 return "color=red";
3702 if (Entry->State == TreeEntry::ScatterVectorize)
3703 return "color=blue";
3704 return "";
3705 }
3706 };
3707
3708 } // end namespace llvm
3709
3710 BoUpSLP::~BoUpSLP() {
3711 SmallVector<WeakTrackingVH> DeadInsts;
3712 for (auto *I : DeletedInstructions) {
3713 for (Use &U : I->operands()) {
3714 auto *Op = dyn_cast<Instruction>(U.get());
3715 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3716 wouldInstructionBeTriviallyDead(Op, TLI))
3717 DeadInsts.emplace_back(Op);
3718 }
3719 I->dropAllReferences();
3720 }
3721 for (auto *I : DeletedInstructions) {
3722 assert(I->use_empty() &&
3723 "trying to erase instruction with users.");
3724 I->eraseFromParent();
3725 }
3726
3727 // Cleanup any dead scalar code feeding the vectorized instructions.
3728 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3729
3730 #ifdef EXPENSIVE_CHECKS
3731 // If we could guarantee that this call is not extremely slow, we could
3732 // remove the ifdef limitation (see PR47712).
3733 assert(!verifyFunction(*F, &dbgs()));
3734 #endif
3735 }
3736
3737 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3738 /// contains the original mask for the scalars reused in the node. The
3739 /// procedure transforms this mask in accordance with the given \p Mask.
3740 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3741 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3742 "Expected non-empty mask.");
3743 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3744 Prev.swap(Reuses);
3745 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3746 if (Mask[I] != PoisonMaskElem)
3747 Reuses[Mask[I]] = Prev[I];
3748 }
3749
3750 /// Reorders the given \p Order according to the given \p Mask. \p Order is
3751 /// the original order of the scalars. The procedure transforms the provided
3752 /// order in accordance with the given \p Mask. If the resulting \p Order is
3753 /// just an identity order, \p Order is cleared.
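/// E.g. (worked example derived from the code below), for Order = {1, 0} and
/// Mask = <1, 0> the mask undoes the swap, so the result is the identity
/// order and \p Order is cleared.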
3754 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
3755 assert(!Mask.empty() && "Expected non-empty mask.");
3756 SmallVector<int> MaskOrder;
3757 if (Order.empty()) {
3758 MaskOrder.resize(Mask.size());
3759 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3760 } else {
3761 inversePermutation(Order, MaskOrder);
3762 }
3763 reorderReuses(MaskOrder, Mask);
3764 if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
3765 Order.clear();
3766 return;
3767 }
3768 Order.assign(Mask.size(), Mask.size());
3769 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3770 if (MaskOrder[I] != PoisonMaskElem)
3771 Order[MaskOrder[I]] = I;
3772 fixupOrderingIndices(Order);
3773 }
3774
3775 std::optional<BoUpSLP::OrdersType>
3776 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3777 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3778 unsigned NumScalars = TE.Scalars.size();
3779 OrdersType CurrentOrder(NumScalars, NumScalars);
3780 SmallVector<int> Positions;
3781 SmallBitVector UsedPositions(NumScalars);
3782 const TreeEntry *STE = nullptr;
3783 // Try to find all gathered scalars that get vectorized in another
3784 // vectorized node. Here we can have only one single tree vector node to
3785 // correctly identify the order of the gathered scalars.
3786 for (unsigned I = 0; I < NumScalars; ++I) {
3787 Value *V = TE.Scalars[I];
3788 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3789 continue;
3790 if (const auto *LocalSTE = getTreeEntry(V)) {
3791 if (!STE)
3792 STE = LocalSTE;
3793 else if (STE != LocalSTE)
3794 // Take the order only from the single vector node.
3795 return std::nullopt;
3796 unsigned Lane =
3797 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3798 if (Lane >= NumScalars)
3799 return std::nullopt;
3800 if (CurrentOrder[Lane] != NumScalars) {
3801 if (Lane != I)
3802 continue;
3803 UsedPositions.reset(CurrentOrder[Lane]);
3804 }
3805 // The partial identity (where only some elements of the gather node are
3806 // in the identity order) is good.
3807 CurrentOrder[Lane] = I;
3808 UsedPositions.set(I);
3809 }
3810 }
3811 // Need to keep the order if we have a vector entry and at least 2 scalars or
3812 // the vectorized entry has just 2 scalars.
3813 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3814 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3815 for (unsigned I = 0; I < NumScalars; ++I)
3816 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3817 return false;
3818 return true;
3819 };
3820 if (IsIdentityOrder(CurrentOrder))
3821 return OrdersType();
3822 auto *It = CurrentOrder.begin();
3823 for (unsigned I = 0; I < NumScalars;) {
3824 if (UsedPositions.test(I)) {
3825 ++I;
3826 continue;
3827 }
3828 if (*It == NumScalars) {
3829 *It = I;
3830 ++I;
3831 }
3832 ++It;
3833 }
3834 return std::move(CurrentOrder);
3835 }
3836 return std::nullopt;
3837 }
3838
3839 namespace {
3840 /// Tracks how the loads in a given sequence can be represented.
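/// Gather means the loads must stay scalar and be gathered; Vectorize means a
/// single consecutive wide load can be used; ScatterVectorize means the loads
/// can be lowered to a masked gather of their pointers.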
3841 enum class LoadsState { Gather, Vectorize, ScatterVectorize }; 3842 } // anonymous namespace 3843 3844 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, 3845 const TargetLibraryInfo &TLI, 3846 bool CompareOpcodes = true) { 3847 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) 3848 return false; 3849 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); 3850 if (!GEP1) 3851 return false; 3852 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); 3853 if (!GEP2) 3854 return false; 3855 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 && 3856 ((isConstant(GEP1->getOperand(1)) && 3857 isConstant(GEP2->getOperand(1))) || 3858 !CompareOpcodes || 3859 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) 3860 .getOpcode()); 3861 } 3862 3863 /// Checks if the given array of loads can be represented as a vectorized, 3864 /// scatter or just simple gather. 3865 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, 3866 const TargetTransformInfo &TTI, 3867 const DataLayout &DL, ScalarEvolution &SE, 3868 LoopInfo &LI, const TargetLibraryInfo &TLI, 3869 SmallVectorImpl<unsigned> &Order, 3870 SmallVectorImpl<Value *> &PointerOps) { 3871 // Check that a vectorized load would load the same memory as a scalar 3872 // load. For example, we don't want to vectorize loads that are smaller 3873 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 3874 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 3875 // from such a struct, we read/write packed bits disagreeing with the 3876 // unvectorized version. 3877 Type *ScalarTy = VL0->getType(); 3878 3879 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) 3880 return LoadsState::Gather; 3881 3882 // Make sure all loads in the bundle are simple - we can't vectorize 3883 // atomic or volatile loads. 3884 PointerOps.clear(); 3885 PointerOps.resize(VL.size()); 3886 auto *POIter = PointerOps.begin(); 3887 for (Value *V : VL) { 3888 auto *L = cast<LoadInst>(V); 3889 if (!L->isSimple()) 3890 return LoadsState::Gather; 3891 *POIter = L->getPointerOperand(); 3892 ++POIter; 3893 } 3894 3895 Order.clear(); 3896 // Check the order of pointer operands or that all pointers are the same. 3897 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); 3898 if (IsSorted || all_of(PointerOps, [&](Value *P) { 3899 return arePointersCompatible(P, PointerOps.front(), TLI); 3900 })) { 3901 if (IsSorted) { 3902 Value *Ptr0; 3903 Value *PtrN; 3904 if (Order.empty()) { 3905 Ptr0 = PointerOps.front(); 3906 PtrN = PointerOps.back(); 3907 } else { 3908 Ptr0 = PointerOps[Order.front()]; 3909 PtrN = PointerOps[Order.back()]; 3910 } 3911 std::optional<int> Diff = 3912 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); 3913 // Check that the sorted loads are consecutive. 3914 if (static_cast<unsigned>(*Diff) == VL.size() - 1) 3915 return LoadsState::Vectorize; 3916 } 3917 // TODO: need to improve analysis of the pointers, if not all of them are 3918 // GEPs or have > 2 operands, we end up with a gather node, which just 3919 // increases the cost. 
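// The heuristic below considers a masked gather when at most half of the
// pointers are loop-invariant (and there are more than two of them), or when
// every pointer is either a two-operand GEP or does not need to be scheduled.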
3920 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent()); 3921 bool ProfitableGatherPointers = 3922 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) { 3923 return L && L->isLoopInvariant(V); 3924 })) <= VL.size() / 2 && VL.size() > 2; 3925 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { 3926 auto *GEP = dyn_cast<GetElementPtrInst>(P); 3927 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || 3928 (GEP && GEP->getNumOperands() == 2); 3929 })) { 3930 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); 3931 for (Value *V : VL) 3932 CommonAlignment = 3933 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); 3934 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); 3935 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && 3936 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) 3937 return LoadsState::ScatterVectorize; 3938 } 3939 } 3940 3941 return LoadsState::Gather; 3942 } 3943 3944 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, 3945 const DataLayout &DL, ScalarEvolution &SE, 3946 SmallVectorImpl<unsigned> &SortedIndices) { 3947 assert(llvm::all_of( 3948 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && 3949 "Expected list of pointer operands."); 3950 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 3951 // Ptr into, sort and return the sorted indices with values next to one 3952 // another. 3953 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases; 3954 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); 3955 3956 unsigned Cnt = 1; 3957 for (Value *Ptr : VL.drop_front()) { 3958 bool Found = any_of(Bases, [&](auto &Base) { 3959 std::optional<int> Diff = 3960 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, 3961 /*StrictCheck=*/true); 3962 if (!Diff) 3963 return false; 3964 3965 Base.second.emplace_back(Ptr, *Diff, Cnt++); 3966 return true; 3967 }); 3968 3969 if (!Found) { 3970 // If we haven't found enough to usefully cluster, return early. 3971 if (Bases.size() > VL.size() / 2 - 1) 3972 return false; 3973 3974 // Not found already - add a new Base 3975 Bases[Ptr].emplace_back(Ptr, 0, Cnt++); 3976 } 3977 } 3978 3979 // For each of the bases sort the pointers by Offset and check if any of the 3980 // base become consecutively allocated. 3981 bool AnyConsecutive = false; 3982 for (auto &Base : Bases) { 3983 auto &Vec = Base.second; 3984 if (Vec.size() > 1) { 3985 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X, 3986 const std::tuple<Value *, int, unsigned> &Y) { 3987 return std::get<1>(X) < std::get<1>(Y); 3988 }); 3989 int InitialOffset = std::get<1>(Vec[0]); 3990 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) { 3991 return std::get<1>(P.value()) == int(P.index()) + InitialOffset; 3992 }); 3993 } 3994 } 3995 3996 // Fill SortedIndices array only if it looks worth-while to sort the ptrs. 
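// E.g. (worked example derived from the code above), for pointers
// {p, p+2, q, p+1, q+1} two bases p and q are formed and the sorted indices
// are {0, 3, 1, 2, 4}, grouping the accesses of each base by offset.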
3997 SortedIndices.clear();
3998 if (!AnyConsecutive)
3999 return false;
4000
4001 for (auto &Base : Bases) {
4002 for (auto &T : Base.second)
4003 SortedIndices.push_back(std::get<2>(T));
4004 }
4005
4006 assert(SortedIndices.size() == VL.size() &&
4007 "Expected SortedIndices to be the size of VL");
4008 return true;
4009 }
4010
4011 std::optional<BoUpSLP::OrdersType>
4012 BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4013 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4014 Type *ScalarTy = TE.Scalars[0]->getType();
4015
4016 SmallVector<Value *> Ptrs;
4017 Ptrs.reserve(TE.Scalars.size());
4018 for (Value *V : TE.Scalars) {
4019 auto *L = dyn_cast<LoadInst>(V);
4020 if (!L || !L->isSimple())
4021 return std::nullopt;
4022 Ptrs.push_back(L->getPointerOperand());
4023 }
4024
4025 BoUpSLP::OrdersType Order;
4026 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4027 return std::move(Order);
4028 return std::nullopt;
4029 }
4030
4031 /// Check if two insertelement instructions are from the same buildvector.
4032 static bool areTwoInsertFromSameBuildVector(
4033 InsertElementInst *VU, InsertElementInst *V,
4034 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4035 // Instructions must be from the same basic block.
4036 if (VU->getParent() != V->getParent())
4037 return false;
4038 // Checks if 2 insertelements are from the same buildvector.
4039 if (VU->getType() != V->getType())
4040 return false;
4041 // Multiply-used inserts are separate nodes.
4042 if (!VU->hasOneUse() && !V->hasOneUse())
4043 return false;
4044 auto *IE1 = VU;
4045 auto *IE2 = V;
4046 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4047 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4048 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4049 return false;
4050 // Go through the vector operand of insertelement instructions trying to find
4051 // either VU as the original vector for IE2 or V as the original vector for
4052 // IE1.
4053 SmallSet<int, 8> ReusedIdx;
4054 bool IsReusedIdx = false;
4055 do {
4056 if (IE2 == VU && !IE1)
4057 return VU->hasOneUse();
4058 if (IE1 == V && !IE2)
4059 return V->hasOneUse();
4060 if (IE1 && IE1 != V) {
4061 IsReusedIdx |=
4062 !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
4063 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4064 IE1 = nullptr;
4065 else
4066 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4067 }
4068 if (IE2 && IE2 != VU) {
4069 IsReusedIdx |=
4070 !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
4071 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4072 IE2 = nullptr;
4073 else
4074 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4075 }
4076 } while (!IsReusedIdx && (IE1 || IE2));
4077 return false;
4078 }
4079
4080 std::optional<BoUpSLP::OrdersType>
4081 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4082 // No need to reorder if we only need to shuffle reuses; the node still
4083 // needs to be shuffled anyway.
4084 if (!TE.ReuseShuffleIndices.empty()) {
4085 // Check if reuse shuffle indices can be improved by reordering.
4086 // For this, check that the reuse mask is "clustered", i.e. each scalar
4087 // value is used once in each submask of size <number_of_scalars>.
4088 // Example: 4 scalar values.
4089 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4090 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4091 // element 3 is used twice in the second submask.
4092 unsigned Sz = TE.Scalars.size(); 4093 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, 4094 Sz)) 4095 return std::nullopt; 4096 unsigned VF = TE.getVectorFactor(); 4097 // Try build correct order for extractelement instructions. 4098 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), 4099 TE.ReuseShuffleIndices.end()); 4100 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() && 4101 all_of(TE.Scalars, [Sz](Value *V) { 4102 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V)); 4103 return Idx && *Idx < Sz; 4104 })) { 4105 SmallVector<int> ReorderMask(Sz, PoisonMaskElem); 4106 if (TE.ReorderIndices.empty()) 4107 std::iota(ReorderMask.begin(), ReorderMask.end(), 0); 4108 else 4109 inversePermutation(TE.ReorderIndices, ReorderMask); 4110 for (unsigned I = 0; I < VF; ++I) { 4111 int &Idx = ReusedMask[I]; 4112 if (Idx == PoisonMaskElem) 4113 continue; 4114 Value *V = TE.Scalars[ReorderMask[Idx]]; 4115 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V)); 4116 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI)); 4117 } 4118 } 4119 // Build the order of the VF size, need to reorder reuses shuffles, they are 4120 // always of VF size. 4121 OrdersType ResOrder(VF); 4122 std::iota(ResOrder.begin(), ResOrder.end(), 0); 4123 auto *It = ResOrder.begin(); 4124 for (unsigned K = 0; K < VF; K += Sz) { 4125 OrdersType CurrentOrder(TE.ReorderIndices); 4126 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)}; 4127 if (SubMask.front() == PoisonMaskElem) 4128 std::iota(SubMask.begin(), SubMask.end(), 0); 4129 reorderOrder(CurrentOrder, SubMask); 4130 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); 4131 std::advance(It, Sz); 4132 } 4133 if (all_of(enumerate(ResOrder), 4134 [](const auto &Data) { return Data.index() == Data.value(); })) 4135 return std::nullopt; // No need to reorder. 
4136 return std::move(ResOrder); 4137 } 4138 if (TE.State == TreeEntry::Vectorize && 4139 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || 4140 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && 4141 !TE.isAltShuffle()) 4142 return TE.ReorderIndices; 4143 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { 4144 auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { 4145 if (V1 == V2) 4146 return false; 4147 if (!V1->hasOneUse() || !V2->hasOneUse()) 4148 return false; 4149 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin()); 4150 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin()); 4151 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1)) 4152 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) { 4153 if (!areTwoInsertFromSameBuildVector( 4154 IE1, IE2, 4155 [](InsertElementInst *II) { return II->getOperand(0); })) 4156 return false; 4157 std::optional<unsigned> Idx1 = getInsertIndex(IE1); 4158 std::optional<unsigned> Idx2 = getInsertIndex(IE2); 4159 if (Idx1 == std::nullopt || Idx2 == std::nullopt) 4160 return false; 4161 return *Idx1 < *Idx2; 4162 } 4163 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1)) 4164 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) { 4165 if (EE1->getOperand(0) != EE2->getOperand(0)) 4166 return false; 4167 std::optional<unsigned> Idx1 = getExtractIndex(EE1); 4168 std::optional<unsigned> Idx2 = getExtractIndex(EE2); 4169 if (Idx1 == std::nullopt || Idx2 == std::nullopt) 4170 return false; 4171 return *Idx1 < *Idx2; 4172 } 4173 return false; 4174 }; 4175 auto IsIdentityOrder = [](const OrdersType &Order) { 4176 for (unsigned Idx : seq<unsigned>(0, Order.size())) 4177 if (Idx != Order[Idx]) 4178 return false; 4179 return true; 4180 }; 4181 if (!TE.ReorderIndices.empty()) 4182 return TE.ReorderIndices; 4183 DenseMap<Value *, unsigned> PhiToId; 4184 SmallVector<Value *, 4> Phis; 4185 OrdersType ResOrder(TE.Scalars.size()); 4186 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { 4187 PhiToId[TE.Scalars[Id]] = Id; 4188 Phis.push_back(TE.Scalars[Id]); 4189 } 4190 llvm::stable_sort(Phis, PHICompare); 4191 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) 4192 ResOrder[Id] = PhiToId[Phis[Id]]; 4193 if (IsIdentityOrder(ResOrder)) 4194 return std::nullopt; // No need to reorder. 4195 return std::move(ResOrder); 4196 } 4197 if (TE.State == TreeEntry::NeedToGather) { 4198 // TODO: add analysis of other gather nodes with extractelement 4199 // instructions and other values/instructions, not only undefs. 4200 if (((TE.getOpcode() == Instruction::ExtractElement && 4201 !TE.isAltShuffle()) || 4202 (all_of(TE.Scalars, 4203 [](Value *V) { 4204 return isa<UndefValue, ExtractElementInst>(V); 4205 }) && 4206 any_of(TE.Scalars, 4207 [](Value *V) { return isa<ExtractElementInst>(V); }))) && 4208 all_of(TE.Scalars, 4209 [](Value *V) { 4210 auto *EE = dyn_cast<ExtractElementInst>(V); 4211 return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); 4212 }) && 4213 allSameType(TE.Scalars)) { 4214 // Check that gather of extractelements can be represented as 4215 // just a shuffle of a single vector. 
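// E.g., a gather of <extractelement %v, 1> and <extractelement %v, 0> can be
// emitted as a single shufflevector of %v with the mask <1, 0>.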
4216 OrdersType CurrentOrder;
4217 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
4218 if (Reuse || !CurrentOrder.empty()) {
4219 if (!CurrentOrder.empty())
4220 fixupOrderingIndices(CurrentOrder);
4221 return std::move(CurrentOrder);
4222 }
4223 }
4224 // If the gather node is <undef, v, .., poison> and
4225 // insertelement poison, v, 0 [+ permute]
4226 // is cheaper than
4227 // insertelement poison, v, n - try to reorder.
4228 // When rotating the whole graph, exclude the permute cost, since the whole
4229 // graph might be transformed anyway.
4230 int Sz = TE.Scalars.size();
4231 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4232 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4233 const auto *It =
4234 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4235 if (It == TE.Scalars.begin())
4236 return OrdersType();
4237 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4238 if (It != TE.Scalars.end()) {
4239 OrdersType Order(Sz, Sz);
4240 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4241 Order[Idx] = 0;
4242 fixupOrderingIndices(Order);
4243 SmallVector<int> Mask;
4244 inversePermutation(Order, Mask);
4245 InstructionCost PermuteCost =
4246 TopToBottom
4247 ? 0
4248 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4249 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4250 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4251 PoisonValue::get(Ty), *It);
4252 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4253 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4254 PoisonValue::get(Ty), *It);
4255 if (InsertFirstCost + PermuteCost < InsertIdxCost)
4256 return std::move(Order);
4257 }
4258 }
4259 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4260 return CurrentOrder;
4261 if (TE.Scalars.size() >= 4)
4262 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4263 return Order;
4264 }
4265 return std::nullopt;
4266 }
4267
4268 /// Checks if the given mask is a "clustered" mask with the same clusters of
4269 /// size \p Sz, which are not identity submasks.
4270 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4271 unsigned Sz) {
4272 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4273 if (ShuffleVectorInst::isIdentityMask(FirstCluster))
4274 return false;
4275 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4276 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4277 if (Cluster != FirstCluster)
4278 return false;
4279 }
4280 return true;
4281 }
4282
4283 void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4284 // Reorder the reuses mask.
4285 reorderReuses(TE.ReuseShuffleIndices, Mask);
4286 const unsigned Sz = TE.Scalars.size();
4287 // For vectorized nodes and non-clustered reuses there is nothing else to do.
4288 if (TE.State != TreeEntry::NeedToGather ||
4289 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4290 Sz) ||
4291 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4292 return;
4293 SmallVector<int> NewMask;
4294 inversePermutation(TE.ReorderIndices, NewMask);
4295 addMask(NewMask, TE.ReuseShuffleIndices);
4296 // Clear the reorder since it is going to be applied to the new mask.
4297 TE.ReorderIndices.clear();
4298 // Try to improve gathered nodes with clustered reuses, if possible.
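// E.g. (sketch assuming empty ReorderIndices), scalars {a, b} with the
// clustered reuses mask <1, 0, 1, 0> are reordered to {b, a}, after which the
// reuses mask becomes the per-cluster identity <0, 1, 0, 1>.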
4299 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4300 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4301 inversePermutation(NewOrder, NewMask);
4302 reorderScalars(TE.Scalars, NewMask);
4303 // Fill the reuses mask with the identity submasks.
4304 for (auto *It = TE.ReuseShuffleIndices.begin(),
4305 *End = TE.ReuseShuffleIndices.end();
4306 It != End; std::advance(It, Sz))
4307 std::iota(It, std::next(It, Sz), 0);
4308 }
4309
4310 void BoUpSLP::reorderTopToBottom() {
4311 // Maps VF to the graph nodes.
4312 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4313 // ExtractElement gather nodes which can be vectorized and need to handle
4314 // their ordering.
4315 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4316
4317 // Phi nodes can have a preferred ordering based on their result users.
4318 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4319
4320 // AltShuffles can also have a preferred ordering that leads to fewer
4321 // instructions, e.g., the addsub instruction in x86.
4322 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4323
4324 // Maps a TreeEntry to the reorder indices of external users.
4325 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4326 ExternalUserReorderMap;
4327 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4328 TargetTransformInfo &TTIRef = *TTI;
4329 // Find all reorderable nodes with the given VF.
4330 // Currently these are vectorized stores, loads, extracts + some gathering
4331 // of extracts.
4332 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4333 &GathersToOrders, &ExternalUserReorderMap,
4334 &AltShufflesToOrders, &PhisToOrders](
4335 const std::unique_ptr<TreeEntry> &TE) {
4336 // Look for external users that will probably be vectorized.
4337 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4338 findExternalStoreUsersReorderIndices(TE.get());
4339 if (!ExternalUserReorderIndices.empty()) {
4340 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4341 ExternalUserReorderMap.try_emplace(TE.get(),
4342 std::move(ExternalUserReorderIndices));
4343 }
4344
4345 // Patterns like [fadd,fsub] can be combined into a single instruction in
4346 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4347 // to take into account their order when looking for the most used order.
4348 if (TE->isAltShuffle()) {
4349 VectorType *VecTy =
4350 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4351 unsigned Opcode0 = TE->getOpcode();
4352 unsigned Opcode1 = TE->getAltOpcode();
4353 // The opcode mask selects between the two opcodes.
4354 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4355 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4356 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4357 OpcodeMask.set(Lane);
4358 // If this pattern is supported by the target then we consider the order.
4359 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4360 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4361 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4362 }
4363 // TODO: Check the reverse order too.
4364 }
4365
4366 if (std::optional<OrdersType> CurrentOrder =
4367 getReorderingData(*TE, /*TopToBottom=*/true)) {
4368 // Do not include ordering for nodes used in the alt opcode vectorization,
4369 // better to reorder them during bottom-to-top stage. If we follow the order
      // If we follow the order here, it causes reordering of the whole graph,
      // though actually it is profitable just to reorder the subgraph that
      // starts from the alternate opcode vectorization node. Such nodes
      // already end up with the shuffle instruction and it is enough to
      // change this shuffle rather than rotate the scalars for the whole
      // graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       VF /= 2) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We are just looking for the
    // most used order and reorder scalar elements in the nodes according to
    // this mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; we still need to extend them and use
      // a shuffle, just need to merge the reordering shuffle and the reuse
      // shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder a node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
    unsigned Cnt = OrdersUses.front().second;
    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    // Set order of the user node.
    if (BestOrder.empty())
      continue;
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          // Update ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if (TE->State == TreeEntry::Vectorize &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build correct orders for extract{element,value}, loads and
        // stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}

bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular vectorize
      // node, just reorder reuses mask.
      if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE &&
                                       EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}

void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate
  // operands + some gathering of extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
                              &NonVectorized](
                                 const std::unique_ptr<TreeEntry> &TE) {
    if (TE->State != TreeEntry::Vectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to keep at least
  // one operand in the natural order and reorder the others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operand order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries.
    for_each(Filtered,
             [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
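        // E.g., if a store node holds the mask <2, 0, 1>, the order counted
        // here is its inverse permutation <1, 2, 0>; symmetric masks such as
        // <1, 0, 2, 3> are their own inverse.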
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes
          // require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increasing
          // compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering than operands with the natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      // If there are no orders - skip the current nodes and jump to the next
      // one, if any.
      if (OrdersUses.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Choose the best order.
      ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
      unsigned Cnt = OrdersUses.front().second;
      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
      }
      // Set order of the user node (reordering of operands and user nodes).
      if (BestOrder.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Erase operands from the OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize)
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers we just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle()) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list to try to sink the reordering
          // deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}

void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      int FoundLane = Entry->findLaneForValue(Scalar);

      // Check if the scalar is externally used as an extra arg.
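      // (Extra args are values recorded in ExternallyUsedValues, e.g. the
      // extra operands of a matched reduction; they are added with a null
      // user so that an extract is always emitted for them.)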
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        if (isDeleted(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}

DenseMap<Value *, SmallVector<StoreInst *, 4>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time we don't visit if we have too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if it is already in the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
        continue;
      // Skip if in different BBs.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}

bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
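  // E.g., for same-sized stores to p+2, p+0 and p+1 the vector becomes
  // {{S0, 0}, {S1, -2}, {S2, -1}} (offsets are relative to the first store),
  // and sorting by offset recovers the memory order S1, S2, S0.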
  SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec[0] = {S0, 0};
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // We failed to compare the pointers so just abandon this StoresVec.
    if (!Diff)
      return false;
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the vector based on the pointers. We create a copy because we may
  // need the original later for calculating the reorder (shuffle) indices.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });

  // Check if the stores are consecutive by checking if their difference is 1.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      return false;

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      if (Idx != Order[Idx])
        return false;
    return true;
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and
  // push it into `ExternalReorderIndices`.
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
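    // On success, ReorderIndices[I] is the position of StoresVec[I] in the
    // sorted, consecutive order; e.g. stores at offsets 2, 0, 1 produce
    // {2, 0, 1}, while the identity order is returned as an empty vector.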
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector
    // instruction, so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
#ifndef NDEBUG
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
#endif

/// Generates a key/subkey pair for the given value to provide effective
/// sorting of the values and better detection of vectorizable value
/// sequences. The keys/subkeys can be used for better sorting of the values
/// themselves (keys) and within value subgroups (subkeys).
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CmpInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
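      // E.g., two zext instructions that each feed from a load fold the load
      // operand's key into both their key and subkey, so structurally
      // identical cast chains land in the same subgroup.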
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
        SubKey = hash_combine(hash_value(I->getOpcode()),
                              hash_value(Call->getCalledFunction()));
      } else {
        Key = hash_combine(hash_value(Call), Key);
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
                              hash_value(Op.Tag), SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);

BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Check for terminator values (e.g. invoke).
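    // (Illustrative note: an incoming value produced by a terminator, e.g.
    // the result of an invoke, cannot be moved to build a vector operand, so
    // such bundles are gathered instead.)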
    for (Value *V : VL)
      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getInsertIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
                              PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      return TreeEntry::ScatterVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL,
                      [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      CmpInst *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
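    // E.g., mixing "gep i32, ptr %p, ..." and "gep i64, ptr %q, ..." in one
    // bundle would require a different index scaling per lane, so such
    // bundles are gathered instead.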
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
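    // E.g., four calls to @llvm.fabs.f32 can become one call to
    // @llvm.fabs.v4f32, and calls with a vector mapping registered in the
    // VFDatabase can use the mapped library routine instead.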
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments, and these must be the same
      // across all calls for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcodes like add-sub
    // then do not vectorize this instruction.
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndicies;
  SmallVector<Value *> UniqueValues;
  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
                                &UserTreeIdx,
                                this](const InstructionsState &S) {
    // Check that every instruction appears once in this bundle.
    DenseMap<Value *, unsigned> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndicies.emplace_back(
            isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 &&
           all_of(UniqueValues,
                  [](Value *V) {
                    return isa<UndefValue>(V) || !isConstant(V);
                  })) ||
          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
        VL.size() >= 4 &&
        (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() ==
                      cast<Instruction>(S.MainOp)->getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle scalable vectors.
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle vectors.
  if (S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
      return;
    }

  // If all of the operands are identical or constant we have a simple
  // solution.
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // If this is an alternate op node with 2 elements and gathered operands -
  // do not vectorize.
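  // E.g., an add/sub pair whose operands are neither instructions nor
  // extracts from a vector would need both operands gathered plus a shuffle
  // of the two results, which the look-ahead scoring below deems more
  // expensive than keeping the two scalar instructions.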
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 2> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.MainOp->getNumOperands() / 2)
      return false;
    if (S.MainOp->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameInsts =
      (S.getOpcode() && allSameBlock(VL)) ||
      (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.OpValue) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
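  // (Ephemeral values only feed llvm.assume-like uses and disappear during
  // code generation, so building vectors out of them would be wasted work.)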
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndicies);
      return;
    }
    // Record the reuse of the tree node. FIXME: currently this is only used
    // to properly draw the graph rather than for the actual vectorization.
    E->UserTreeIndices.push_back(UserTreeIdx);
    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
                      << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (Value *V : VL) {
    if (!IsScatterVectorizeUserTE && !isa<Instruction>(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndicies);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndicies);
        return;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (AreAllSameInsts && UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
      !(S.getOpcode() && allSameBlock(VL))) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
               2 &&
           "Expected pointers only.");
    // Reset S to make it a GetElementPtr kind of node.
    const auto *It =
        find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  auto *VL0 = cast<Instruction>(S.OpValue);
  BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (isa<CatchSwitchInst>(BB->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndicies);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndicies);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
    LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

    // Keeps the reordered operands to avoid code duplication.
    SmallVector<ValueList, 2> OperandsVec;
    for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
      if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
        ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
        TE->setOperand(I, Operands);
        OperandsVec.push_back(Operands);
        continue;
      }
      ValueList Operands;
      // Prepare the operand vector.
      for (Value *V : VL)
        Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
            PH->getIncomingBlock(I)));
      TE->setOperand(I, Operands);
      OperandsVec.push_back(Operands);
    }
    for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
      buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
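      // The common source vector is still recorded as operand 0 for all
      // lanes so that cost modeling and codegen can find it, but it is not
      // recursed into.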
5838 ValueList Op0; 5839 Op0.assign(VL.size(), VL0->getOperand(0)); 5840 VectorizableTree.back()->setOperand(0, Op0); 5841 return; 5842 } 5843 LLVM_DEBUG({ 5844 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " 5845 "with order"; 5846 for (unsigned Idx : CurrentOrder) 5847 dbgs() << " " << Idx; 5848 dbgs() << "\n"; 5849 }); 5850 fixupOrderingIndices(CurrentOrder); 5851 // Create a new tree entry for this reordered extract sequence, recording 5852 // the computed order. 5853 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5854 ReuseShuffleIndicies, CurrentOrder); 5855 // This is a special case, as it does not gather, but at the same time 5856 // we are not extending buildTree_rec() towards the operands. 5857 ValueList Op0; 5858 Op0.assign(VL.size(), VL0->getOperand(0)); 5859 VectorizableTree.back()->setOperand(0, Op0); 5860 return; 5861 } 5862 case Instruction::InsertElement: { 5863 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); 5864 5865 auto OrdCompare = [](const std::pair<int, int> &P1, 5866 const std::pair<int, int> &P2) { 5867 return P1.first > P2.first; 5868 }; 5869 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, 5870 decltype(OrdCompare)> 5871 Indices(OrdCompare); 5872 for (int I = 0, E = VL.size(); I < E; ++I) { 5873 unsigned Idx = *getInsertIndex(VL[I]); 5874 Indices.emplace(Idx, I); 5875 } 5876 OrdersType CurrentOrder(VL.size(), VL.size()); 5877 bool IsIdentity = true; 5878 for (int I = 0, E = VL.size(); I < E; ++I) { 5879 CurrentOrder[Indices.top().second] = I; 5880 IsIdentity &= Indices.top().second == I; 5881 Indices.pop(); 5882 } 5883 if (IsIdentity) 5884 CurrentOrder.clear(); 5885 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5886 std::nullopt, CurrentOrder); 5887 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); 5888 5889 constexpr int NumOps = 2; 5890 ValueList VectorOperands[NumOps]; 5891 for (int I = 0; I < NumOps; ++I) { 5892 for (Value *V : VL) 5893 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I)); 5894 5895 TE->setOperand(I, VectorOperands[I]); 5896 } 5897 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1}); 5898 return; 5899 } 5900 case Instruction::Load: { 5901 // Check that a vectorized load would load the same memory as a scalar 5902 // load. For example, we don't want to vectorize loads that are smaller 5903 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5904 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 5905 // from such a struct, we read/write packed bits disagreeing with the 5906 // unvectorized version. 5907 TreeEntry *TE = nullptr; 5908 switch (State) { 5909 case TreeEntry::Vectorize: 5910 if (CurrentOrder.empty()) { 5911 // Original loads are consecutive and do not require reordering. 5912 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5913 ReuseShuffleIndicies); 5914 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 5915 } else { 5916 fixupOrderingIndices(CurrentOrder); 5917 // Need to reorder. 5918 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5919 ReuseShuffleIndicies, CurrentOrder); 5920 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); 5921 } 5922 TE->setOperandsInOrder(); 5923 break; 5924 case TreeEntry::ScatterVectorize: 5925 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
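// As a sketch (illustrative IR only, not the exact builder output), loads
// from %p[0], %p[5], %p[2], %p[9] are emitted as a pointer vector fed into
//   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs,
//            i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
//            <4 x i32> poison)
// which is why PointerOps is recursed into as operand 0 below.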
5926 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, 5927 UserTreeIdx, ReuseShuffleIndicies); 5928 TE->setOperandsInOrder(); 5929 buildTree_rec(PointerOps, Depth + 1, {TE, 0}); 5930 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); 5931 break; 5932 case TreeEntry::NeedToGather: 5933 llvm_unreachable("Unexpected loads state."); 5934 } 5935 return; 5936 } 5937 case Instruction::ZExt: 5938 case Instruction::SExt: 5939 case Instruction::FPToUI: 5940 case Instruction::FPToSI: 5941 case Instruction::FPExt: 5942 case Instruction::PtrToInt: 5943 case Instruction::IntToPtr: 5944 case Instruction::SIToFP: 5945 case Instruction::UIToFP: 5946 case Instruction::Trunc: 5947 case Instruction::FPTrunc: 5948 case Instruction::BitCast: { 5949 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5950 ReuseShuffleIndicies); 5951 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 5952 5953 TE->setOperandsInOrder(); 5954 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 5955 ValueList Operands; 5956 // Prepare the operand vector. 5957 for (Value *V : VL) 5958 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 5959 5960 buildTree_rec(Operands, Depth + 1, {TE, i}); 5961 } 5962 return; 5963 } 5964 case Instruction::ICmp: 5965 case Instruction::FCmp: { 5966 // Check that all of the compares have the same predicate. 5967 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 5968 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 5969 ReuseShuffleIndicies); 5970 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 5971 5972 ValueList Left, Right; 5973 if (cast<CmpInst>(VL0)->isCommutative()) { 5974 // Commutative predicate - collect + sort operands of the instructions 5975 // so that each side is more likely to have the same opcode. 5976 assert(P0 == CmpInst::getSwappedPredicate(P0) && 5977 "Commutative Predicate mismatch"); 5978 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); 5979 } else { 5980 // Collect operands - commute if it uses the swapped predicate. 5981 for (Value *V : VL) { 5982 auto *Cmp = cast<CmpInst>(V); 5983 Value *LHS = Cmp->getOperand(0); 5984 Value *RHS = Cmp->getOperand(1); 5985 if (Cmp->getPredicate() != P0) 5986 std::swap(LHS, RHS); 5987 Left.push_back(LHS); 5988 Right.push_back(RHS); 5989 } 5990 } 5991 TE->setOperand(0, Left); 5992 TE->setOperand(1, Right); 5993 buildTree_rec(Left, Depth + 1, {TE, 0}); 5994 buildTree_rec(Right, Depth + 1, {TE, 1}); 5995 return; 5996 } 5997 case Instruction::Select: 5998 case Instruction::FNeg: 5999 case Instruction::Add: 6000 case Instruction::FAdd: 6001 case Instruction::Sub: 6002 case Instruction::FSub: 6003 case Instruction::Mul: 6004 case Instruction::FMul: 6005 case Instruction::UDiv: 6006 case Instruction::SDiv: 6007 case Instruction::FDiv: 6008 case Instruction::URem: 6009 case Instruction::SRem: 6010 case Instruction::FRem: 6011 case Instruction::Shl: 6012 case Instruction::LShr: 6013 case Instruction::AShr: 6014 case Instruction::And: 6015 case Instruction::Or: 6016 case Instruction::Xor: { 6017 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6018 ReuseShuffleIndicies); 6019 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); 6020 6021 // Sort operands of the instructions so that each side is more likely to 6022 // have the same opcode. 
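// E.g. for a hypothetical commutative bundle {a + x, y + b}, the reordering
// may pick Left = {a, b} and Right = {x, y}, so that each operand vector is
// more likely to become a vectorizable bundle of its own.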
6023 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 6024 ValueList Left, Right; 6025 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); 6026 TE->setOperand(0, Left); 6027 TE->setOperand(1, Right); 6028 buildTree_rec(Left, Depth + 1, {TE, 0}); 6029 buildTree_rec(Right, Depth + 1, {TE, 1}); 6030 return; 6031 } 6032 6033 TE->setOperandsInOrder(); 6034 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 6035 ValueList Operands; 6036 // Prepare the operand vector. 6037 for (Value *V : VL) 6038 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 6039 6040 buildTree_rec(Operands, Depth + 1, {TE, i}); 6041 } 6042 return; 6043 } 6044 case Instruction::GetElementPtr: { 6045 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6046 ReuseShuffleIndicies); 6047 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 6048 SmallVector<ValueList, 2> Operands(2); 6049 // Prepare the operand vector for pointer operands. 6050 for (Value *V : VL) { 6051 auto *GEP = dyn_cast<GetElementPtrInst>(V); 6052 if (!GEP) { 6053 Operands.front().push_back(V); 6054 continue; 6055 } 6056 Operands.front().push_back(GEP->getPointerOperand()); 6057 } 6058 TE->setOperand(0, Operands.front()); 6059 // Need to cast all indices to the same type before vectorization to 6060 // avoid a crash. 6061 // Required to be able to find correct matches between different gather 6062 // nodes and reuse the vectorized values rather than trying to gather them 6063 // again. 6064 int IndexIdx = 1; 6065 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); 6066 Type *Ty = all_of(VL, 6067 [VL0Ty, IndexIdx](Value *V) { 6068 auto *GEP = dyn_cast<GetElementPtrInst>(V); 6069 if (!GEP) 6070 return true; 6071 return VL0Ty == GEP->getOperand(IndexIdx)->getType(); 6072 }) 6073 ? VL0Ty 6074 : DL->getIndexType(cast<GetElementPtrInst>(VL0) 6075 ->getPointerOperandType() 6076 ->getScalarType()); 6077 // Prepare the operand vector. 6078 for (Value *V : VL) { 6079 auto *I = dyn_cast<GetElementPtrInst>(V); 6080 if (!I) { 6081 Operands.back().push_back( 6082 ConstantInt::get(Ty, 0, /*isSigned=*/false)); 6083 continue; 6084 } 6085 auto *Op = I->getOperand(IndexIdx); 6086 auto *CI = dyn_cast<ConstantInt>(Op); 6087 if (!CI) 6088 Operands.back().push_back(Op); 6089 else 6090 Operands.back().push_back(ConstantExpr::getIntegerCast( 6091 CI, Ty, CI->getValue().isSignBitSet())); 6092 } 6093 TE->setOperand(IndexIdx, Operands.back()); 6094 6095 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) 6096 buildTree_rec(Operands[I], Depth + 1, {TE, I}); 6097 return; 6098 } 6099 case Instruction::Store: { 6100 // Check if the stores are consecutive or if we need to swizzle them. 6101 ValueList Operands(VL.size()); 6102 auto *OIter = Operands.begin(); 6103 for (Value *V : VL) { 6104 auto *SI = cast<StoreInst>(V); 6105 *OIter = SI->getValueOperand(); 6106 ++OIter; 6107 } 6108 // Check that the sorted pointer operands are consecutive. 6109 if (CurrentOrder.empty()) { 6110 // Original stores are consecutive and do not require reordering.
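// (If they are not, CurrentOrder records the permutation; e.g. stores to
// p[0], p[2], p[1], p[3] (a hypothetical example) are emitted as a single
// reordered vector store in the else branch below.)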
6111 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6112 ReuseShuffleIndicies); 6113 TE->setOperandsInOrder(); 6114 buildTree_rec(Operands, Depth + 1, {TE, 0}); 6115 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 6116 } else { 6117 fixupOrderingIndices(CurrentOrder); 6118 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6119 ReuseShuffleIndicies, CurrentOrder); 6120 TE->setOperandsInOrder(); 6121 buildTree_rec(Operands, Depth + 1, {TE, 0}); 6122 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); 6123 } 6124 return; 6125 } 6126 case Instruction::Call: { 6127 // Check if the calls are all to the same vectorizable intrinsic or 6128 // library function. 6129 CallInst *CI = cast<CallInst>(VL0); 6130 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6131 6132 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6133 ReuseShuffleIndicies); 6134 TE->setOperandsInOrder(); 6135 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { 6136 // For scalar operands there is no need to create an entry since there 6137 // is no need to vectorize them. 6138 if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) 6139 continue; 6140 ValueList Operands; 6141 // Prepare the operand vector. 6142 for (Value *V : VL) { 6143 auto *CI2 = cast<CallInst>(V); 6144 Operands.push_back(CI2->getArgOperand(i)); 6145 } 6146 buildTree_rec(Operands, Depth + 1, {TE, i}); 6147 } 6148 return; 6149 } 6150 case Instruction::ShuffleVector: { 6151 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 6152 ReuseShuffleIndicies); 6153 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 6154 6155 // Reorder operands if reordering would enable vectorization. 6156 auto *CI = dyn_cast<CmpInst>(VL0); 6157 if (isa<BinaryOperator>(VL0) || CI) { 6158 ValueList Left, Right; 6159 if (!CI || all_of(VL, [](Value *V) { 6160 return cast<CmpInst>(V)->isCommutative(); 6161 })) { 6162 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, 6163 *this); 6164 } else { 6165 auto *MainCI = cast<CmpInst>(S.MainOp); 6166 auto *AltCI = cast<CmpInst>(S.AltOp); 6167 CmpInst::Predicate MainP = MainCI->getPredicate(); 6168 CmpInst::Predicate AltP = AltCI->getPredicate(); 6169 assert(MainP != AltP && 6170 "Expected different main/alternate predicates."); 6171 // Collect operands - commute if it uses the swapped predicate or 6172 // alternate operation. 6173 for (Value *V : VL) { 6174 auto *Cmp = cast<CmpInst>(V); 6175 Value *LHS = Cmp->getOperand(0); 6176 Value *RHS = Cmp->getOperand(1); 6177 6178 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { 6179 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 6180 std::swap(LHS, RHS); 6181 } else { 6182 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 6183 std::swap(LHS, RHS); 6184 } 6185 Left.push_back(LHS); 6186 Right.push_back(RHS); 6187 } 6188 } 6189 TE->setOperand(0, Left); 6190 TE->setOperand(1, Right); 6191 buildTree_rec(Left, Depth + 1, {TE, 0}); 6192 buildTree_rec(Right, Depth + 1, {TE, 1}); 6193 return; 6194 } 6195 6196 TE->setOperandsInOrder(); 6197 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 6198 ValueList Operands; 6199 // Prepare the operand vector.
6200 for (Value *V : VL) 6201 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 6202 6203 buildTree_rec(Operands, Depth + 1, {TE, i}); 6204 } 6205 return; 6206 } 6207 default: 6208 break; 6209 } 6210 llvm_unreachable("Unexpected vectorization of the instructions."); 6211 } 6212 6213 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { 6214 unsigned N = 1; 6215 Type *EltTy = T; 6216 6217 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) { 6218 if (auto *ST = dyn_cast<StructType>(EltTy)) { 6219 // Check that struct is homogeneous. 6220 for (const auto *Ty : ST->elements()) 6221 if (Ty != *ST->element_begin()) 6222 return 0; 6223 N *= ST->getNumElements(); 6224 EltTy = *ST->element_begin(); 6225 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { 6226 N *= AT->getNumElements(); 6227 EltTy = AT->getElementType(); 6228 } else { 6229 auto *VT = cast<FixedVectorType>(EltTy); 6230 N *= VT->getNumElements(); 6231 EltTy = VT->getElementType(); 6232 } 6233 } 6234 6235 if (!isValidElementType(EltTy)) 6236 return 0; 6237 uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); 6238 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || 6239 VTSize != DL.getTypeStoreSizeInBits(T)) 6240 return 0; 6241 return N; 6242 } 6243 6244 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, 6245 SmallVectorImpl<unsigned> &CurrentOrder) const { 6246 const auto *It = find_if(VL, [](Value *V) { 6247 return isa<ExtractElementInst, ExtractValueInst>(V); 6248 }); 6249 assert(It != VL.end() && "Expected at least one extract instruction."); 6250 auto *E0 = cast<Instruction>(*It); 6251 assert(all_of(VL, 6252 [](Value *V) { 6253 return isa<UndefValue, ExtractElementInst, ExtractValueInst>( 6254 V); 6255 }) && 6256 "Invalid opcode"); 6257 // Check if all of the extracts come from the same vector and from the 6258 // correct offset. 6259 Value *Vec = E0->getOperand(0); 6260 6261 CurrentOrder.clear(); 6262 6263 // We have to extract from a vector/aggregate with the same number of elements. 6264 unsigned NElts; 6265 if (E0->getOpcode() == Instruction::ExtractValue) { 6266 const DataLayout &DL = E0->getModule()->getDataLayout(); 6267 NElts = canMapToVector(Vec->getType(), DL); 6268 if (!NElts) 6269 return false; 6270 // Check if load can be rewritten as load of vector. 6271 LoadInst *LI = dyn_cast<LoadInst>(Vec); 6272 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) 6273 return false; 6274 } else { 6275 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); 6276 } 6277 6278 if (NElts != VL.size()) 6279 return false; 6280 6281 // Check that all of the indices extract from the correct offset. 6282 bool ShouldKeepOrder = true; 6283 unsigned E = VL.size(); 6284 // Assign to all items the initial value E so we can check if the extract 6285 // instruction index was used already. 6286 // Also, later we can check that all the indices are used and we have a 6287 // consecutive access in the extract instructions, by checking that no 6288 // element of CurrentOrder still has value E.
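// For example (hypothetical), for VL of extracts with lane indices
// {1, 0, 3, 2} from one 4-element source, the loop below fills
// CurrentOrder = {1, 0, 3, 2} and clears ShouldKeepOrder: the bundle is a
// permutation of lanes 0..3 rather than an identity sequence.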
6289 CurrentOrder.assign(E, E); 6290 unsigned I = 0; 6291 for (; I < E; ++I) { 6292 auto *Inst = dyn_cast<Instruction>(VL[I]); 6293 if (!Inst) 6294 continue; 6295 if (Inst->getOperand(0) != Vec) 6296 break; 6297 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) 6298 if (isa<UndefValue>(EE->getIndexOperand())) 6299 continue; 6300 std::optional<unsigned> Idx = getExtractIndex(Inst); 6301 if (!Idx) 6302 break; 6303 const unsigned ExtIdx = *Idx; 6304 if (ExtIdx != I) { 6305 if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) 6306 break; 6307 ShouldKeepOrder = false; 6308 CurrentOrder[ExtIdx] = I; 6309 } else { 6310 if (CurrentOrder[I] != E) 6311 break; 6312 CurrentOrder[I] = I; 6313 } 6314 } 6315 if (I < E) { 6316 CurrentOrder.clear(); 6317 return false; 6318 } 6319 if (ShouldKeepOrder) 6320 CurrentOrder.clear(); 6321 6322 return ShouldKeepOrder; 6323 } 6324 6325 bool BoUpSLP::areAllUsersVectorized(Instruction *I, 6326 ArrayRef<Value *> VectorizedVals) const { 6327 return (I->hasOneUse() && is_contained(VectorizedVals, I)) || 6328 all_of(I->users(), [this](User *U) { 6329 return ScalarToTreeEntry.count(U) > 0 || 6330 isVectorLikeInstWithConstOps(U) || 6331 (isa<ExtractElementInst>(U) && MustGather.contains(U)); 6332 }); 6333 } 6334 6335 static std::pair<InstructionCost, InstructionCost> 6336 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, 6337 TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { 6338 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6339 6340 // Calculate the cost of the scalar and vector calls. 6341 SmallVector<Type *, 4> VecTys; 6342 for (Use &Arg : CI->args()) 6343 VecTys.push_back( 6344 FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); 6345 FastMathFlags FMF; 6346 if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) 6347 FMF = FPCI->getFastMathFlags(); 6348 SmallVector<const Value *> Arguments(CI->args()); 6349 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, 6350 dyn_cast<IntrinsicInst>(CI)); 6351 auto IntrinsicCost = 6352 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); 6353 6354 auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( 6355 VecTy->getNumElements())), 6356 false /*HasGlobalPred*/); 6357 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 6358 auto LibCost = IntrinsicCost; 6359 if (!CI->isNoBuiltin() && VecFunc) { 6360 // Calculate the cost of the vector library call. 6361 // If the corresponding vector call is cheaper, return its cost. 6362 LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, 6363 TTI::TCK_RecipThroughput); 6364 } 6365 return {IntrinsicCost, LibCost}; 6366 } 6367 6368 /// Builds a shuffle mask for shuffle graph entries and the lists of main and 6369 /// alternate operations' operands.
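/// For example (hypothetical), for VL = {add, sub, add, sub} with IsAltOp
/// matching the subs, no reordering and no reuses, the resulting mask is
/// <0, Sz + 1, 2, Sz + 3>: even lanes select from the main (add) vector and
/// odd lanes from the alternate (sub) vector.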
6370 static void 6371 buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, 6372 ArrayRef<int> ReusesIndices, 6373 const function_ref<bool(Instruction *)> IsAltOp, 6374 SmallVectorImpl<int> &Mask, 6375 SmallVectorImpl<Value *> *OpScalars = nullptr, 6376 SmallVectorImpl<Value *> *AltScalars = nullptr) { 6377 unsigned Sz = VL.size(); 6378 Mask.assign(Sz, PoisonMaskElem); 6379 SmallVector<int> OrderMask; 6380 if (!ReorderIndices.empty()) 6381 inversePermutation(ReorderIndices, OrderMask); 6382 for (unsigned I = 0; I < Sz; ++I) { 6383 unsigned Idx = I; 6384 if (!ReorderIndices.empty()) 6385 Idx = OrderMask[I]; 6386 auto *OpInst = cast<Instruction>(VL[Idx]); 6387 if (IsAltOp(OpInst)) { 6388 Mask[I] = Sz + Idx; 6389 if (AltScalars) 6390 AltScalars->push_back(OpInst); 6391 } else { 6392 Mask[I] = Idx; 6393 if (OpScalars) 6394 OpScalars->push_back(OpInst); 6395 } 6396 } 6397 if (!ReusesIndices.empty()) { 6398 SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem); 6399 transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { 6400 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; 6401 }); 6402 Mask.swap(NewMask); 6403 } 6404 } 6405 6406 static bool isAlternateInstruction(const Instruction *I, 6407 const Instruction *MainOp, 6408 const Instruction *AltOp, 6409 const TargetLibraryInfo &TLI) { 6410 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) { 6411 auto *AltCI = cast<CmpInst>(AltOp); 6412 CmpInst::Predicate MainP = MainCI->getPredicate(); 6413 CmpInst::Predicate AltP = AltCI->getPredicate(); 6414 assert(MainP != AltP && "Expected different main/alternate predicates."); 6415 auto *CI = cast<CmpInst>(I); 6416 if (isCmpSameOrSwapped(MainCI, CI, TLI)) 6417 return false; 6418 if (isCmpSameOrSwapped(AltCI, CI, TLI)) 6419 return true; 6420 CmpInst::Predicate P = CI->getPredicate(); 6421 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P); 6422 6423 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && 6424 "CmpInst expected to match either main or alternate predicate or " 6425 "their swap."); 6426 (void)AltP; 6427 return MainP != P && MainP != SwappedP; 6428 } 6429 return I->getOpcode() == AltOp->getOpcode(); 6430 } 6431 6432 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL, 6433 unsigned OpIdx) { 6434 assert(!VL.empty()); 6435 const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof)); 6436 const auto *Op0 = I0->getOperand(OpIdx); 6437 6438 const bool IsConstant = all_of(VL, [&](Value *V) { 6439 // TODO: We should allow undef elements here 6440 const auto *I = dyn_cast<Instruction>(V); 6441 if (!I) 6442 return true; 6443 auto *Op = I->getOperand(OpIdx); 6444 return isConstant(Op) && !isa<UndefValue>(Op); 6445 }); 6446 const bool IsUniform = all_of(VL, [&](Value *V) { 6447 // TODO: We should allow undef elements here 6448 const auto *I = dyn_cast<Instruction>(V); 6449 if (!I) 6450 return false; 6451 return I->getOperand(OpIdx) == Op0; 6452 }); 6453 const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { 6454 // TODO: We should allow undef elements here 6455 const auto *I = dyn_cast<Instruction>(V); 6456 if (!I) { 6457 assert((isa<UndefValue>(V) || 6458 I0->getOpcode() == Instruction::GetElementPtr) && 6459 "Expected undef or GEP."); 6460 return true; 6461 } 6462 auto *Op = I->getOperand(OpIdx); 6463 if (auto *CI = dyn_cast<ConstantInt>(Op)) 6464 return CI->getValue().isPowerOf2(); 6465 return false; 6466 }); 6467 const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { 6468 // TODO: We should allow 
undef elements here 6469 const auto *I = dyn_cast<Instruction>(V); 6470 if (!I) { 6471 assert((isa<UndefValue>(V) || 6472 I0->getOpcode() == Instruction::GetElementPtr) && 6473 "Expected undef or GEP."); 6474 return true; 6475 } 6476 const auto *Op = I->getOperand(OpIdx); 6477 if (auto *CI = dyn_cast<ConstantInt>(Op)) 6478 return CI->getValue().isNegatedPowerOf2(); 6479 return false; 6480 }); 6481 6482 TTI::OperandValueKind VK = TTI::OK_AnyValue; 6483 if (IsConstant && IsUniform) 6484 VK = TTI::OK_UniformConstantValue; 6485 else if (IsConstant) 6486 VK = TTI::OK_NonUniformConstantValue; 6487 else if (IsUniform) 6488 VK = TTI::OK_UniformValue; 6489 6490 TTI::OperandValueProperties VP = TTI::OP_None; 6491 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP; 6492 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP; 6493 6494 return {VK, VP}; 6495 } 6496 6497 namespace { 6498 /// The base class for shuffle instruction emission and shuffle cost estimation. 6499 class BaseShuffleAnalysis { 6500 protected: 6501 /// Checks if the mask is an identity mask. 6502 /// \param IsStrict if true, the function returns false if the mask size does 6503 /// not match the vector size. 6504 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy, 6505 bool IsStrict) { 6506 int Limit = Mask.size(); 6507 int VF = VecTy->getNumElements(); 6508 return (VF == Limit || !IsStrict) && 6509 all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && 6510 ShuffleVectorInst::isIdentityMask(Mask); 6511 } 6512 6513 /// Tries to combine 2 different masks into a single one. 6514 /// \param LocalVF Vector length of the permuted input vector. \p Mask may 6515 /// change the size of the vector, \p LocalVF is the original size of the 6516 /// shuffled vector. 6517 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask, 6518 ArrayRef<int> ExtMask) { 6519 unsigned VF = Mask.size(); 6520 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); 6521 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { 6522 if (ExtMask[I] == PoisonMaskElem) 6523 continue; 6524 int MaskedIdx = Mask[ExtMask[I] % VF]; 6525 NewMask[I] = 6526 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF; 6527 } 6528 Mask.swap(NewMask); 6529 } 6530 6531 /// Looks through shuffles trying to reduce the final number of shuffles in 6532 /// the code. The function looks through the previously emitted shuffle 6533 /// instructions and properly marks indices in the mask as undef. 6534 /// For example, given the code 6535 /// \code 6536 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> 6537 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> 6538 /// \endcode 6539 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, 6540 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask 6541 /// <0, 1, 2, 3> for the shuffle. 6542 /// If 2 operands are of different size, the smallest one will be resized and 6543 /// the mask recalculated properly. 6544 /// For example, given the code 6545 /// \code 6546 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> 6547 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> 6548 /// \endcode 6549 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, 6550 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask 6551 /// <0, 1, 2, 3> for the shuffle. 6552 /// So, it tries to transform permutations into a simple vector merge, if 6553 /// possible. 6554 /// \param V The input vector which must be shuffled using the given \p Mask.
6555 /// If the better candidate is found, \p V is set to this best candidate 6556 /// vector. 6557 /// \param Mask The input mask for the shuffle. If the best candidate is found 6558 /// during looking-through-shuffles attempt, it is updated accordingly. 6559 /// \param SinglePermute true if the shuffle operation is originally a 6560 /// single-value-permutation. In this case the look-through-shuffles procedure 6561 /// may look for resizing shuffles as the best candidates. 6562 /// \return true if the shuffle results in the non-resizing identity shuffle 6563 /// (and thus can be ignored), false - otherwise. 6564 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask, 6565 bool SinglePermute) { 6566 Value *Op = V; 6567 ShuffleVectorInst *IdentityOp = nullptr; 6568 SmallVector<int> IdentityMask; 6569 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) { 6570 // Exit if not a fixed vector type or a size-changing shuffle. 6571 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType()); 6572 if (!SVTy) 6573 break; 6574 // Remember the identity or broadcast mask, if it is not a resizing 6575 // shuffle. If no better candidates are found, this Op and Mask will be 6576 // used in the final shuffle. 6577 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) { 6578 if (!IdentityOp || !SinglePermute || 6579 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) && 6580 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) { 6581 IdentityOp = SV; 6582 // Store the current mask in IdentityMask so that we do not lose 6583 // this info if IdentityOp is selected as the best candidate for the 6584 // permutation. 6585 IdentityMask.assign(Mask); 6586 } 6587 } 6588 // Remember the broadcast mask. If no better candidates are found, this Op 6589 // and Mask will be used in the final shuffle. 6590 // Zero splat can be used as identity too, since it might be used with 6591 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling. 6592 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which 6593 // is expensive, and the analysis finds out that the source vector is just 6594 // a broadcast, the original mask can be transformed into the identity 6595 // mask <0, 1, 2, 3>. 6596 // \code 6597 // %0 = shuffle %v, poison, zeroinitializer 6598 // %res = shuffle %0, poison, <3, 1, 2, 0> 6599 // \endcode 6600 // may be transformed to 6601 // \code 6602 // %0 = shuffle %v, poison, zeroinitializer 6603 // %res = shuffle %0, poison, <0, 1, 2, 3> 6604 // \endcode 6605 if (SV->isZeroEltSplat()) { 6606 IdentityOp = SV; 6607 IdentityMask.assign(Mask); 6608 } 6609 int LocalVF = Mask.size(); 6610 if (auto *SVOpTy = 6611 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType())) 6612 LocalVF = SVOpTy->getNumElements(); 6613 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem); 6614 for (auto [Idx, I] : enumerate(Mask)) { 6615 if (I == PoisonMaskElem || 6616 static_cast<unsigned>(I) >= SV->getShuffleMask().size()) 6617 continue; 6618 ExtMask[Idx] = SV->getMaskValue(I); 6619 } 6620 bool IsOp1Undef = 6621 isUndefVector(SV->getOperand(0), 6622 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg)) 6623 .all(); 6624 bool IsOp2Undef = 6625 isUndefVector(SV->getOperand(1), 6626 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg)) 6627 .all(); 6628 if (!IsOp1Undef && !IsOp2Undef) { 6629 // Update the mask and mark undef elems.
6630 for (int &I : Mask) { 6631 if (I == PoisonMaskElem) 6632 continue; 6633 if (SV->getMaskValue(I % SV->getShuffleMask().size()) == 6634 PoisonMaskElem) 6635 I = PoisonMaskElem; 6636 } 6637 break; 6638 } 6639 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), 6640 SV->getShuffleMask().end()); 6641 combineMasks(LocalVF, ShuffleMask, Mask); 6642 Mask.swap(ShuffleMask); 6643 if (IsOp2Undef) 6644 Op = SV->getOperand(0); 6645 else 6646 Op = SV->getOperand(1); 6647 } 6648 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); 6649 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || 6650 ShuffleVectorInst::isZeroEltSplatMask(Mask)) { 6651 if (IdentityOp) { 6652 V = IdentityOp; 6653 assert(Mask.size() == IdentityMask.size() && 6654 "Expected masks of same sizes."); 6655 // Clear known poison elements. 6656 for (auto [I, Idx] : enumerate(Mask)) 6657 if (Idx == PoisonMaskElem) 6658 IdentityMask[I] = PoisonMaskElem; 6659 Mask.swap(IdentityMask); 6660 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V); 6661 return SinglePermute && 6662 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()), 6663 /*IsStrict=*/true) || 6664 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && 6665 Shuffle->isZeroEltSplat() && 6666 ShuffleVectorInst::isZeroEltSplatMask(Mask))); 6667 } 6668 V = Op; 6669 return false; 6670 } 6671 V = Op; 6672 return true; 6673 } 6674 6675 /// Smart shuffle instruction emission, walks through shuffle trees and 6676 /// tries to find the best matching vector for the actual shuffle 6677 /// instruction. 6678 template <typename T, typename ShuffleBuilderTy> 6679 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, 6680 ShuffleBuilderTy &Builder) { 6681 assert(V1 && "Expected at least one vector value."); 6682 if (V2) 6683 Builder.resizeToMatch(V1, V2); 6684 int VF = Mask.size(); 6685 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) 6686 VF = FTy->getNumElements(); 6687 if (V2 && 6688 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) { 6689 // Peek through shuffles. 6690 Value *Op1 = V1; 6691 Value *Op2 = V2; 6692 int VF = 6693 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 6694 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 6695 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 6696 for (int I = 0, E = Mask.size(); I < E; ++I) { 6697 if (Mask[I] < VF) 6698 CombinedMask1[I] = Mask[I]; 6699 else 6700 CombinedMask2[I] = Mask[I] - VF; 6701 } 6702 Value *PrevOp1; 6703 Value *PrevOp2; 6704 do { 6705 PrevOp1 = Op1; 6706 PrevOp2 = Op2; 6707 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false); 6708 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false); 6709 // Check if we have 2 resizing shuffles - need to peek through operands 6710 // again.
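// E.g. (illustrative IR) both operands may be widening shuffles of narrow
// vectors:
//   %s1 = shufflevector <2 x ty> %a, <2 x ty> poison,
//            <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
//   %s2 = shufflevector <2 x ty> %b, <2 x ty> poison,
//            <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// in which case %a and %b can be shuffled directly with combined masks.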
6711 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) 6712 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) { 6713 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem); 6714 for (auto [Idx, I] : enumerate(CombinedMask1)) { 6715 if (I == PoisonMaskElem) 6716 continue; 6717 ExtMask1[Idx] = SV1->getMaskValue(I); 6718 } 6719 SmallBitVector UseMask1 = buildUseMask( 6720 cast<FixedVectorType>(SV1->getOperand(1)->getType()) 6721 ->getNumElements(), 6722 ExtMask1, UseMask::SecondArg); 6723 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem); 6724 for (auto [Idx, I] : enumerate(CombinedMask2)) { 6725 if (I == PoisonMaskElem) 6726 continue; 6727 ExtMask2[Idx] = SV2->getMaskValue(I); 6728 } 6729 SmallBitVector UseMask2 = buildUseMask( 6730 cast<FixedVectorType>(SV2->getOperand(1)->getType()) 6731 ->getNumElements(), 6732 ExtMask2, UseMask::SecondArg); 6733 if (SV1->getOperand(0)->getType() == 6734 SV2->getOperand(0)->getType() && 6735 SV1->getOperand(0)->getType() != SV1->getType() && 6736 isUndefVector(SV1->getOperand(1), UseMask1).all() && 6737 isUndefVector(SV2->getOperand(1), UseMask2).all()) { 6738 Op1 = SV1->getOperand(0); 6739 Op2 = SV2->getOperand(0); 6740 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), 6741 SV1->getShuffleMask().end()); 6742 int LocalVF = ShuffleMask1.size(); 6743 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType())) 6744 LocalVF = FTy->getNumElements(); 6745 combineMasks(LocalVF, ShuffleMask1, CombinedMask1); 6746 CombinedMask1.swap(ShuffleMask1); 6747 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), 6748 SV2->getShuffleMask().end()); 6749 LocalVF = ShuffleMask2.size(); 6750 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType())) 6751 LocalVF = FTy->getNumElements(); 6752 combineMasks(LocalVF, ShuffleMask2, CombinedMask2); 6753 CombinedMask2.swap(ShuffleMask2); 6754 } 6755 } 6756 } while (PrevOp1 != Op1 || PrevOp2 != Op2); 6757 Builder.resizeToMatch(Op1, Op2); 6758 VF = std::max(cast<VectorType>(Op1->getType()) 6759 ->getElementCount() 6760 .getKnownMinValue(), 6761 cast<VectorType>(Op2->getType()) 6762 ->getElementCount() 6763 .getKnownMinValue()); 6764 for (int I = 0, E = Mask.size(); I < E; ++I) { 6765 if (CombinedMask2[I] != PoisonMaskElem) { 6766 assert(CombinedMask1[I] == PoisonMaskElem && 6767 "Expected undefined mask element"); 6768 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); 6769 } 6770 } 6771 const int Limit = CombinedMask1.size() * 2; 6772 if (Op1 == Op2 && Limit == 2 * VF && 6773 all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && 6774 (ShuffleVectorInst::isIdentityMask(CombinedMask1) || 6775 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && 6776 isa<ShuffleVectorInst>(Op1) && 6777 cast<ShuffleVectorInst>(Op1)->getShuffleMask() == 6778 ArrayRef(CombinedMask1)))) 6779 return Builder.createIdentity(Op1); 6780 return Builder.createShuffleVector( 6781 Op1, Op1 == Op2 ? 
PoisonValue::get(Op1->getType()) : Op2, 6782 CombinedMask1); 6783 } 6784 if (isa<PoisonValue>(V1)) 6785 return Builder.createPoison( 6786 cast<VectorType>(V1->getType())->getElementType(), Mask.size()); 6787 SmallVector<int> NewMask(Mask.begin(), Mask.end()); 6788 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); 6789 assert(V1 && "Expected non-null value after looking through shuffles."); 6790 6791 if (!IsIdentity) 6792 return Builder.createShuffleVector(V1, NewMask); 6793 return Builder.createIdentity(V1); 6794 } 6795 }; 6796 } // namespace 6797 6798 /// Merges shuffle masks and emits the final shuffle instruction, if required. 6799 /// It supports shuffling of 2 input vectors. It implements lazy shuffle 6800 /// emission: the actual shuffle instruction is generated only if it is 6801 /// actually required. Otherwise, the emission is delayed until the end of the 6802 /// process, to reduce the number of emitted instructions and further 6803 /// analysis/transformations. 6804 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { 6805 bool IsFinalized = false; 6806 SmallVector<int> CommonMask; 6807 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; 6808 const TargetTransformInfo &TTI; 6809 InstructionCost Cost = 0; 6810 ArrayRef<Value *> VectorizedVals; 6811 BoUpSLP &R; 6812 SmallPtrSetImpl<Value *> &CheckedExtracts; 6813 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6814 6815 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { 6816 if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) 6817 return TTI::TCC_Free; 6818 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); 6819 InstructionCost GatherCost = 0; 6820 SmallVector<Value *> Gathers(VL.begin(), VL.end()); 6821 // Improve gather cost for gather of loads, if we can group some of the 6822 // loads into vector loads. 6823 InstructionsState S = getSameOpcode(VL, *R.TLI); 6824 if (VL.size() > 2 && S.getOpcode() == Instruction::Load && 6825 !S.isAltShuffle() && 6826 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && 6827 !isSplat(Gathers)) { 6828 BoUpSLP::ValueSet VectorizedLoads; 6829 unsigned StartIdx = 0; 6830 unsigned VF = VL.size() / 2; 6831 unsigned VectorizedCnt = 0; 6832 unsigned ScatterVectorizeCnt = 0; 6833 const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); 6834 for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { 6835 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; 6836 Cnt += VF) { 6837 ArrayRef<Value *> Slice = VL.slice(Cnt, VF); 6838 if (!VectorizedLoads.count(Slice.front()) && 6839 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { 6840 SmallVector<Value *> PointerOps; 6841 OrdersType CurrentOrder; 6842 LoadsState LS = 6843 canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE, 6844 *R.LI, *R.TLI, CurrentOrder, PointerOps); 6845 switch (LS) { 6846 case LoadsState::Vectorize: 6847 case LoadsState::ScatterVectorize: 6848 // Mark the vectorized loads so that we don't vectorize them 6849 // again. 6850 if (LS == LoadsState::Vectorize) 6851 ++VectorizedCnt; 6852 else 6853 ++ScatterVectorizeCnt; 6854 VectorizedLoads.insert(Slice.begin(), Slice.end()); 6855 // If we vectorized the initial block, no need to try to vectorize 6856 // it again.
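// E.g. with 8 gathered loads (a hypothetical case), the loop first tries
// two slices of VF = 4; if only the slice at [0, 4) vectorizes, StartIdx
// advances to 4 and the remaining scalars are costed as gathers below.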
6857 if (Cnt == StartIdx) 6858 StartIdx += VF; 6859 break; 6860 case LoadsState::Gather: 6861 break; 6862 } 6863 } 6864 } 6865 // Check if the whole array was vectorized already - exit. 6866 if (StartIdx >= VL.size()) 6867 break; 6868 // Found vectorizable parts - exit. 6869 if (!VectorizedLoads.empty()) 6870 break; 6871 } 6872 if (!VectorizedLoads.empty()) { 6873 unsigned NumParts = TTI.getNumberOfParts(VecTy); 6874 bool NeedInsertSubvectorAnalysis = 6875 !NumParts || (VL.size() / VF) > NumParts; 6876 // Get the cost for gathered loads. 6877 for (unsigned I = 0, End = VL.size(); I < End; I += VF) { 6878 if (VectorizedLoads.contains(VL[I])) 6879 continue; 6880 GatherCost += getBuildVectorCost(VL.slice(I, VF), Root); 6881 } 6882 // Exclude potentially vectorized loads from the list of gathered 6883 // scalars. 6884 auto *LI = cast<LoadInst>(S.MainOp); 6885 Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType())); 6886 // The cost for vectorized loads. 6887 InstructionCost ScalarsCost = 0; 6888 for (Value *V : VectorizedLoads) { 6889 auto *LI = cast<LoadInst>(V); 6890 ScalarsCost += 6891 TTI.getMemoryOpCost(Instruction::Load, LI->getType(), 6892 LI->getAlign(), LI->getPointerAddressSpace(), 6893 CostKind, TTI::OperandValueInfo(), LI); 6894 } 6895 auto *LoadTy = FixedVectorType::get(LI->getType(), VF); 6896 Align Alignment = LI->getAlign(); 6897 GatherCost += 6898 VectorizedCnt * 6899 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, 6900 LI->getPointerAddressSpace(), CostKind, 6901 TTI::OperandValueInfo(), LI); 6902 GatherCost += ScatterVectorizeCnt * 6903 TTI.getGatherScatterOpCost( 6904 Instruction::Load, LoadTy, LI->getPointerOperand(), 6905 /*VariableMask=*/false, Alignment, CostKind, LI); 6906 if (NeedInsertSubvectorAnalysis) { 6907 // Add the cost for the subvectors insert. 6908 for (int I = VF, E = VL.size(); I < E; I += VF) 6909 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, 6910 std::nullopt, CostKind, I, LoadTy); 6911 } 6912 GatherCost -= ScalarsCost; 6913 } 6914 } else if (!Root && isSplat(VL)) { 6915 // Found a broadcast of a single scalar; calculate the cost as a 6916 // broadcast. 6917 const auto *It = 6918 find_if(VL, [](Value *V) { return !isa<UndefValue>(V); }); 6919 assert(It != VL.end() && "Expected at least one non-undef value."); 6920 // Add a broadcast for a non-identity shuffle only. 6921 bool NeedShuffle = 6922 count(VL, *It) > 1 && 6923 (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof)); 6924 InstructionCost InsertCost = TTI.getVectorInstrCost( 6925 Instruction::InsertElement, VecTy, CostKind, 6926 NeedShuffle ? 0 : std::distance(VL.begin(), It), 6927 PoisonValue::get(VecTy), *It); 6928 return InsertCost + 6929 (NeedShuffle ? TTI.getShuffleCost( 6930 TargetTransformInfo::SK_Broadcast, VecTy, 6931 /*Mask=*/std::nullopt, CostKind, /*Index=*/0, 6932 /*SubTp=*/nullptr, /*Args=*/*It) 6933 : TTI::TCC_Free); 6934 } 6935 return GatherCost + 6936 (all_of(Gathers, UndefValue::classof) 6937 ? TTI::TCC_Free 6938 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); 6939 }; 6940 6941 /// Compute the cost of creating a vector of type \p VecTy containing the 6942 /// extracted values from \p VL.
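/// For example (hypothetical), extracts {%v[0], %v[1], %v[2], %v[3]} can
/// reuse the source register as-is, while {%v[2], %v[0], %v[3], %v[1]} is
/// costed as a single-source permute of that register.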
6943 InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, 6944 TTI::ShuffleKind ShuffleKind) { 6945 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); 6946 unsigned NumOfParts = TTI.getNumberOfParts(VecTy); 6947 6948 if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || 6949 !NumOfParts || VecTy->getNumElements() < NumOfParts) 6950 return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); 6951 6952 bool AllConsecutive = true; 6953 unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; 6954 unsigned Idx = -1; 6955 InstructionCost Cost = 0; 6956 6957 // Process extracts in blocks of EltsPerVector to check if the source vector 6958 // operand can be re-used directly. If not, add the cost of creating a 6959 // shuffle to extract the values into a vector register. 6960 SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem); 6961 for (auto *V : VL) { 6962 ++Idx; 6963 6964 // Reached the start of a new vector register. 6965 if (Idx % EltsPerVector == 0) { 6966 RegMask.assign(EltsPerVector, PoisonMaskElem); 6967 AllConsecutive = true; 6968 continue; 6969 } 6970 6971 // Need to exclude undefs from analysis. 6972 if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem) 6973 continue; 6974 6975 // Check that all extracts for a vector register on the target directly 6976 // extract values in order. 6977 unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); 6978 if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { 6979 unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); 6980 AllConsecutive &= PrevIdx + 1 == CurrentIdx && 6981 CurrentIdx % EltsPerVector == Idx % EltsPerVector; 6982 RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; 6983 } 6984 6985 if (AllConsecutive) 6986 continue; 6987 6988 // Skip all indices, except for the last index per vector block. 6989 if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) 6990 continue; 6991 6992 // If we have a series of extracts which are not consecutive and hence 6993 // cannot re-use the source vector register directly, compute the shuffle 6994 // cost to extract the vector with EltsPerVector elements. 6995 Cost += TTI.getShuffleCost( 6996 TargetTransformInfo::SK_PermuteSingleSrc, 6997 FixedVectorType::get(VecTy->getElementType(), EltsPerVector), 6998 RegMask); 6999 } 7000 return Cost; 7001 } 7002 7003 class ShuffleCostBuilder { 7004 const TargetTransformInfo &TTI; 7005 7006 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { 7007 int Limit = 2 * VF; 7008 return Mask.empty() || 7009 (VF == Mask.size() && 7010 all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && 7011 ShuffleVectorInst::isIdentityMask(Mask)); 7012 } 7013 7014 public: 7015 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} 7016 ~ShuffleCostBuilder() = default; 7017 InstructionCost createShuffleVector(Value *V1, Value *, 7018 ArrayRef<int> Mask) const { 7019 // An empty mask or an identity mask is free. 7020 unsigned VF = 7021 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 7022 if (isEmptyOrIdentity(Mask, VF)) 7023 return TTI::TCC_Free; 7024 return TTI.getShuffleCost( 7025 TTI::SK_PermuteTwoSrc, 7026 FixedVectorType::get( 7027 cast<VectorType>(V1->getType())->getElementType(), Mask.size()), 7028 Mask); 7029 } 7030 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { 7031 // An empty mask or an identity mask is free.
7032 if (isEmptyOrIdentity(Mask, Mask.size())) 7033 return TTI::TCC_Free; 7034 return TTI.getShuffleCost( 7035 TTI::SK_PermuteSingleSrc, 7036 FixedVectorType::get( 7037 cast<VectorType>(V1->getType())->getElementType(), Mask.size()), 7038 Mask); 7039 } 7040 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } 7041 InstructionCost createPoison(Type *Ty, unsigned VF) const { 7042 return TTI::TCC_Free; 7043 } 7044 void resizeToMatch(Value *&, Value *&) const {} 7045 }; 7046 7047 /// Smart shuffle instruction emission, walks through shuffle trees and 7048 /// tries to find the best matching vector for the actual shuffle 7049 /// instruction. 7050 InstructionCost 7051 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1, 7052 const PointerUnion<Value *, const TreeEntry *> &P2, 7053 ArrayRef<int> Mask) { 7054 ShuffleCostBuilder Builder(TTI); 7055 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); 7056 unsigned CommonVF = 0; 7057 if (!V1) { 7058 const TreeEntry *E = P1.get<const TreeEntry *>(); 7059 unsigned VF = E->getVectorFactor(); 7060 if (V2) { 7061 unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); 7062 if (V2VF != VF && V2VF == E->Scalars.size()) 7063 VF = E->Scalars.size(); 7064 } else if (!P2.isNull()) { 7065 const TreeEntry *E2 = P2.get<const TreeEntry *>(); 7066 if (E->Scalars.size() == E2->Scalars.size()) 7067 CommonVF = VF = E->Scalars.size(); 7068 } else { 7069 // P2 is empty, check that we have the same node + reshuffle (if any). 7070 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { 7071 VF = E->Scalars.size(); 7072 SmallVector<int> CommonMask(Mask.begin(), Mask.end()); 7073 ::addMask(CommonMask, E->getCommonMask()); 7074 V1 = Constant::getNullValue( 7075 FixedVectorType::get(E->Scalars.front()->getType(), VF)); 7076 return BaseShuffleAnalysis::createShuffle<InstructionCost>( 7077 V1, nullptr, CommonMask, Builder); 7078 } 7079 } 7080 V1 = Constant::getNullValue( 7081 FixedVectorType::get(E->Scalars.front()->getType(), VF)); 7082 } 7083 if (!V2 && !P2.isNull()) { 7084 const TreeEntry *E = P2.get<const TreeEntry *>(); 7085 unsigned VF = E->getVectorFactor(); 7086 unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 7087 if (!CommonVF && V1VF == E->Scalars.size()) 7088 CommonVF = E->Scalars.size(); 7089 if (CommonVF) 7090 VF = CommonVF; 7091 V2 = Constant::getNullValue( 7092 FixedVectorType::get(E->Scalars.front()->getType(), VF)); 7093 } 7094 return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask, 7095 Builder); 7096 } 7097 7098 public: 7099 ShuffleCostEstimator(TargetTransformInfo &TTI, 7100 ArrayRef<Value *> VectorizedVals, BoUpSLP &R, 7101 SmallPtrSetImpl<Value *> &CheckedExtracts) 7102 : TTI(TTI), VectorizedVals(VectorizedVals), R(R), 7103 CheckedExtracts(CheckedExtracts) {} 7104 Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask, 7105 TTI::ShuffleKind ShuffleKind) { 7106 if (Mask.empty()) 7107 return nullptr; 7108 Value *VecBase = nullptr; 7109 ArrayRef<Value *> VL = E->Scalars; 7110 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); 7111 // If the resulting type is scalarized, do not adjust the cost. 7112 unsigned VecNumParts = TTI.getNumberOfParts(VecTy); 7113 if (VecNumParts == VecTy->getNumElements()) 7114 return nullptr; 7115 DenseMap<Value *, int> ExtractVectorsTys; 7116 for (auto [I, V] : enumerate(VL)) { 7117 // Ignore non-extractelement scalars.
7118 if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) 7119 continue; 7120 // If all users of the instruction are going to be vectorized and this 7121 // instruction itself is not going to be vectorized, consider this 7122 // instruction as dead and remove its cost from the final cost of the 7123 // vectorized tree. 7124 // Also, avoid adjusting the cost for extractelements with multiple uses 7125 // in different graph entries. 7126 const TreeEntry *VE = R.getTreeEntry(V); 7127 if (!CheckedExtracts.insert(V).second || 7128 !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || 7129 (VE && VE != E)) 7130 continue; 7131 auto *EE = cast<ExtractElementInst>(V); 7132 VecBase = EE->getVectorOperand(); 7133 std::optional<unsigned> EEIdx = getExtractIndex(EE); 7134 if (!EEIdx) 7135 continue; 7136 unsigned Idx = *EEIdx; 7137 if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { 7138 auto It = 7139 ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; 7140 It->getSecond() = std::min<int>(It->second, Idx); 7141 } 7142 // Take credit for instruction that will become dead. 7143 if (EE->hasOneUse()) { 7144 Instruction *Ext = EE->user_back(); 7145 if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { 7146 return isa<GetElementPtrInst>(U); 7147 })) { 7148 // Use getExtractWithExtendCost() to calculate the cost of 7149 // extractelement/ext pair. 7150 Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), 7151 EE->getVectorOperandType(), Idx); 7152 // Add back the cost of s|zext which is subtracted separately. 7153 Cost += TTI.getCastInstrCost( 7154 Ext->getOpcode(), Ext->getType(), EE->getType(), 7155 TTI::getCastContextHint(Ext), CostKind, Ext); 7156 continue; 7157 } 7158 } 7159 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, 7160 Idx); 7161 } 7162 // Add a cost for subvector extracts/inserts if required. 7163 for (const auto &Data : ExtractVectorsTys) { 7164 auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); 7165 unsigned NumElts = VecTy->getNumElements(); 7166 if (Data.second % NumElts == 0) 7167 continue; 7168 if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { 7169 unsigned Idx = (Data.second / NumElts) * NumElts; 7170 unsigned EENumElts = EEVTy->getNumElements(); 7171 if (Idx % NumElts == 0) 7172 continue; 7173 if (Idx + NumElts <= EENumElts) { 7174 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 7175 EEVTy, std::nullopt, CostKind, Idx, VecTy); 7176 } else { 7177 // Need to round up the subvector type vectorization factor to avoid a 7178 // crash in cost model functions. Make SubVT so that Idx + VF of SubVT 7179 // <= EENumElts. 7180 auto *SubVT = 7181 FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); 7182 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 7183 EEVTy, std::nullopt, CostKind, Idx, SubVT); 7184 } 7185 } else { 7186 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, 7187 VecTy, std::nullopt, CostKind, 0, EEVTy); 7188 } 7189 } 7190 // Check that the gather of extractelements can be represented as just a 7191 // shuffle of one or two vectors the scalars are extracted from. 7192 // We found a bunch of extractelement instructions that must be gathered 7193 // into a vector and can be represented as a permutation of elements of a 7194 // single input vector or of 2 input vectors.
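// E.g. (illustrative IR) a gather of
//   %x0 = extractelement <4 x i32> %v, i32 0
//   %x1 = extractelement <4 x i32> %v, i32 2
//   %x2 = extractelement <4 x i32> %v, i32 1
// is a single-source permute of %v, so only that permute cost is added here.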
7195 Cost += computeExtractCost(VL, Mask, ShuffleKind); 7196 return VecBase; 7197 } 7198 void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { 7199 CommonMask.assign(Mask.begin(), Mask.end()); 7200 InVectors.assign({E1, E2}); 7201 } 7202 void add(const TreeEntry *E1, ArrayRef<int> Mask) { 7203 CommonMask.assign(Mask.begin(), Mask.end()); 7204 InVectors.assign(1, E1); 7205 } 7206 /// Adds another input vector and the mask for shuffling. 7207 void add(Value *V1, ArrayRef<int> Mask) { 7208 assert(CommonMask.empty() && InVectors.empty() && 7209 "Expected empty input mask/vectors."); 7210 CommonMask.assign(Mask.begin(), Mask.end()); 7211 InVectors.assign(1, V1); 7212 } 7213 Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { 7214 Cost += getBuildVectorCost(VL, Root); 7215 if (!Root) { 7216 assert(InVectors.empty() && "Unexpected input vectors for buildvector."); 7217 // FIXME: Need to find a way to avoid use of getNullValue here. 7218 SmallVector<Constant *> Vals; 7219 for (Value *V : VL) { 7220 if (isa<UndefValue>(V)) { 7221 Vals.push_back(cast<Constant>(V)); 7222 continue; 7223 } 7224 Vals.push_back(Constant::getNullValue(V->getType())); 7225 } 7226 return ConstantVector::get(Vals); 7227 } 7228 return ConstantVector::getSplat( 7229 ElementCount::getFixed(VL.size()), 7230 Constant::getNullValue(VL.front()->getType())); 7231 } 7232 /// Finalize emission of the shuffles. 7233 InstructionCost 7234 finalize(ArrayRef<int> ExtMask, unsigned VF = 0, 7235 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { 7236 IsFinalized = true; 7237 if (Action) { 7238 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); 7239 if (InVectors.size() == 2) { 7240 Cost += createShuffle(Vec, InVectors.back(), CommonMask); 7241 InVectors.pop_back(); 7242 } else { 7243 Cost += createShuffle(Vec, nullptr, CommonMask); 7244 } 7245 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 7246 if (CommonMask[Idx] != PoisonMaskElem) 7247 CommonMask[Idx] = Idx; 7248 assert(VF > 0 && 7249 "Expected vector length for the final value before action."); 7250 Value *V = Vec.dyn_cast<Value *>(); 7251 if (!Vec.isNull() && !V) 7252 V = Constant::getNullValue(FixedVectorType::get( 7253 Vec.get<const TreeEntry *>()->Scalars.front()->getType(), 7254 CommonMask.size())); 7255 Action(V, CommonMask); 7256 } 7257 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); 7258 if (CommonMask.empty()) 7259 return Cost; 7260 int Limit = CommonMask.size() * 2; 7261 if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && 7262 ShuffleVectorInst::isIdentityMask(CommonMask)) 7263 return Cost; 7264 return Cost + 7265 createShuffle(InVectors.front(), 7266 InVectors.size() == 2 ?
InVectors.back() : nullptr, 7267 CommonMask); 7268 } 7269 7270 ~ShuffleCostEstimator() { 7271 assert((IsFinalized || CommonMask.empty()) && 7272 "Shuffle construction must be finalized."); 7273 } 7274 }; 7275 7276 InstructionCost 7277 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, 7278 SmallPtrSetImpl<Value *> &CheckedExtracts) { 7279 ArrayRef<Value *> VL = E->Scalars; 7280 7281 Type *ScalarTy = VL[0]->getType(); 7282 if (auto *SI = dyn_cast<StoreInst>(VL[0])) 7283 ScalarTy = SI->getValueOperand()->getType(); 7284 else if (auto *CI = dyn_cast<CmpInst>(VL[0])) 7285 ScalarTy = CI->getOperand(0)->getType(); 7286 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) 7287 ScalarTy = IE->getOperand(1)->getType(); 7288 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); 7289 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7290 7291 // If we have computed a smaller type for the expression, update VecTy so 7292 // that the costs will be accurate. 7293 if (MinBWs.count(VL[0])) 7294 VecTy = FixedVectorType::get( 7295 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); 7296 unsigned EntryVF = E->getVectorFactor(); 7297 auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); 7298 7299 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); 7300 if (E->State == TreeEntry::NeedToGather) { 7301 if (allConstant(VL)) 7302 return 0; 7303 if (isa<InsertElementInst>(VL[0])) 7304 return InstructionCost::getInvalid(); 7305 ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, 7306 CheckedExtracts); 7307 unsigned VF = E->getVectorFactor(); 7308 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), 7309 E->ReuseShuffleIndices.end()); 7310 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); 7311 // Build a mask out of the reorder indices and reorder scalars per this 7312 // mask. 7313 SmallVector<int> ReorderMask; 7314 inversePermutation(E->ReorderIndices, ReorderMask); 7315 if (!ReorderMask.empty()) 7316 reorderScalars(GatheredScalars, ReorderMask); 7317 SmallVector<int> Mask; 7318 SmallVector<int> ExtractMask; 7319 std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; 7320 std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; 7321 SmallVector<const TreeEntry *> Entries; 7322 Type *ScalarTy = GatheredScalars.front()->getType(); 7323 // Check for gathered extracts. 7324 ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); 7325 SmallVector<Value *> IgnoredVals; 7326 if (UserIgnoreList) 7327 IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); 7328 7329 bool Resized = false; 7330 if (Value *VecBase = Estimator.adjustExtracts( 7331 E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) 7332 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) 7333 if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { 7334 Resized = true; 7335 GatheredScalars.append(VF - GatheredScalars.size(), 7336 PoisonValue::get(ScalarTy)); 7337 } 7338 7339 // Do not try to look for reshuffled loads for gathered loads (they will be 7340 // handled later), for vectorized scalars, and for cases which are 7341 // definitely not profitable (splats and small gather nodes).
7342 if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
7343 E->isAltShuffle() ||
7344 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
7345 isSplat(E->Scalars) ||
7346 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
7347 GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
7348 if (GatherShuffle) {
7349 assert((Entries.size() == 1 || Entries.size() == 2) &&
7350 "Expected shuffle of 1 or 2 entries.");
7351 if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
7352 Entries.front()->isSame(E->Scalars)) {
7353 // Perfect match in the graph, will reuse the previously vectorized
7354 // node. Cost is 0.
7355 LLVM_DEBUG(
7356 dbgs()
7357 << "SLP: perfect diamond match for gather bundle that starts with "
7358 << *VL.front() << ".\n");
7359 // Restore the mask for previous partially matched values.
7360 for (auto [I, V] : enumerate(E->Scalars)) {
7361 if (isa<PoisonValue>(V)) {
7362 Mask[I] = PoisonMaskElem;
7363 continue;
7364 }
7365 if (Mask[I] == PoisonMaskElem)
7366 Mask[I] = Entries.front()->findLaneForValue(V);
7367 }
7368 Estimator.add(Entries.front(), Mask);
7369 return Estimator.finalize(E->ReuseShuffleIndices);
7370 }
7371 if (!Resized) {
7372 unsigned VF1 = Entries.front()->getVectorFactor();
7373 unsigned VF2 = Entries.back()->getVectorFactor();
7374 if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
7375 GatheredScalars.append(VF - GatheredScalars.size(),
7376 PoisonValue::get(ScalarTy));
7377 }
7378 // Remove shuffled elements from the list of gathers.
7379 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
7380 if (Mask[I] != PoisonMaskElem)
7381 GatheredScalars[I] = PoisonValue::get(ScalarTy);
7382 }
7383 LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
7384 << " entries for bundle that starts with "
7385 << *VL.front() << ".\n");
7386 if (Entries.size() == 1)
7387 Estimator.add(Entries.front(), Mask);
7388 else
7389 Estimator.add(Entries.front(), Entries.back(), Mask);
7390 if (all_of(GatheredScalars, PoisonValue::classof))
7391 return Estimator.finalize(E->ReuseShuffleIndices);
7392 return Estimator.finalize(
7393 E->ReuseShuffleIndices, E->Scalars.size(),
7394 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
7395 Vec = Estimator.gather(GatheredScalars,
7396 Constant::getNullValue(FixedVectorType::get(
7397 GatheredScalars.front()->getType(),
7398 GatheredScalars.size())));
7399 });
7400 }
7401 if (!all_of(GatheredScalars, PoisonValue::classof)) {
7402 auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
7403 bool SameGathers = VL.equals(Gathers);
7404 Value *BV = Estimator.gather(
7405 Gathers, SameGathers ? nullptr
7406 : Constant::getNullValue(FixedVectorType::get(
7407 GatheredScalars.front()->getType(),
7408 GatheredScalars.size())));
7409 SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
7410 std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
7411 Estimator.add(BV, ReuseMask);
7412 }
7413 if (ExtractShuffle)
7414 Estimator.add(E, std::nullopt);
7415 return Estimator.finalize(E->ReuseShuffleIndices);
7416 }
7417 InstructionCost CommonCost = 0;
7418 SmallVector<int> Mask;
7419 if (!E->ReorderIndices.empty()) {
7420 SmallVector<int> NewMask;
7421 if (E->getOpcode() == Instruction::Store) {
7422 // For stores the order is actually a mask.
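// Illustrative example (values not from the original source): with
// E->ReorderIndices = {2, 0, 1}, a store node uses {2, 0, 1} directly as
// the shuffle mask, while for other opcodes inversePermutation
// (Mask[Indices[I]] = I) produces the inverse mask {1, 2, 0}.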
7423 NewMask.resize(E->ReorderIndices.size());
7424 copy(E->ReorderIndices, NewMask.begin());
7425 } else {
7426 inversePermutation(E->ReorderIndices, NewMask);
7427 }
7428 ::addMask(Mask, NewMask);
7429 }
7430 if (NeedToShuffleReuses)
7431 ::addMask(Mask, E->ReuseShuffleIndices);
7432 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
7433 CommonCost =
7434 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
7435 assert((E->State == TreeEntry::Vectorize ||
7436 E->State == TreeEntry::ScatterVectorize) &&
7437 "Unhandled state");
7438 assert(E->getOpcode() &&
7439 ((allSameType(VL) && allSameBlock(VL)) ||
7440 (E->getOpcode() == Instruction::GetElementPtr &&
7441 E->getMainOp()->getType()->isPointerTy())) &&
7442 "Invalid VL");
7443 Instruction *VL0 = E->getMainOp();
7444 unsigned ShuffleOrOp =
7445 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
7446 const unsigned Sz = VL.size();
7447 auto GetCostDiff =
7448 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
7449 function_ref<InstructionCost(InstructionCost)> VectorCost) {
7450 // Calculate the cost of this instruction.
7451 InstructionCost ScalarCost = 0;
7452 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
7453 // For some instructions there is no need to calculate the cost for
7454 // each particular instruction; we can use the cost of a single
7455 // instruction multiplied by the total number of scalar instructions.
7456 ScalarCost = Sz * ScalarEltCost(0);
7457 } else {
7458 for (unsigned I = 0; I < Sz; ++I)
7459 ScalarCost += ScalarEltCost(I);
7460 }
7461
7462 InstructionCost VecCost = VectorCost(CommonCost);
7463 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
7464 ScalarCost, "Calculated costs for Tree"));
7465 return VecCost - ScalarCost;
7466 };
7467 // Calculate cost difference from vectorizing set of GEPs.
7468 // Negative value means vectorizing is profitable.
7469 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
7470 InstructionCost ScalarCost = 0;
7471 InstructionCost VecCost = 0;
7472 // Here we differentiate two cases: (1) when Ptrs represent a regular
7473 // vectorization tree node (as they are pointer arguments of scattered
7474 // loads) or (2) when Ptrs are the arguments of loads or stores being
7475 // vectorized as a plain wide unit-stride load/store, since all the
7476 // loads/stores are known to be from/to adjacent locations.
7477 assert(E->State == TreeEntry::Vectorize &&
7478 "Entry state expected to be Vectorize here.");
7479 if (isa<LoadInst, StoreInst>(VL0)) {
7480 // Case 2: estimate the pointer-related costs when vectorizing to
7481 // a wide load/store.
7482 // Scalar cost is estimated as a set of pointers with known relationship
7483 // between them.
7484 // For vector code we will use BasePtr as the argument for the wide
7485 // load/store, but we also need to account for all the instructions which
7486 // are going to stay in vectorized code due to uses outside of these
7487 // scalar loads/stores.
7488 ScalarCost = TTI->getPointersChainCost(
7489 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7490 CostKind);
7491
7492 SmallVector<const Value *> PtrsRetainedInVecCode;
7493 for (Value *V : Ptrs) {
7494 if (V == BasePtr) {
7495 PtrsRetainedInVecCode.push_back(V);
7496 continue;
7497 }
7498 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7499 // For simplicity, assume Ptr stays in vectorized code if it's not a
7500 // GEP instruction. We don't care, since its cost is considered free.
7501 // TODO: We should check for any uses outside of the vectorizable tree
7502 // rather than just a single use.
7503 if (!Ptr || !Ptr->hasOneUse())
7504 PtrsRetainedInVecCode.push_back(V);
7505 }
7506
7507 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7508 // If all pointers stay in vectorized code then we don't have
7509 // any savings on that.
7510 LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
7511 "Calculated GEPs cost for Tree"));
7512 return InstructionCost{TTI::TCC_Free};
7513 }
7514 VecCost = TTI->getPointersChainCost(
7515 PtrsRetainedInVecCode, BasePtr,
7516 TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
7517 } else {
7518 // Case 1: Ptrs are the arguments of loads that we are going to transform
7519 // into a masked gather load intrinsic.
7520 // All the scalar GEPs will be removed as a result of vectorization.
7521 // For any external uses of some lanes, extractelement instructions will
7522 // be generated (their cost is estimated separately).
7523 TTI::PointersChainInfo PtrsInfo =
7524 all_of(Ptrs,
7525 [](const Value *V) {
7526 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7527 return Ptr && !Ptr->hasAllConstantIndices();
7528 })
7529 ? TTI::PointersChainInfo::getUnknownStride()
7530 : TTI::PointersChainInfo::getKnownStride();
7531
7532 ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
7533 CostKind);
7534 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7535 SmallVector<const Value *> Indices(BaseGEP->indices());
7536 VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
7537 BaseGEP->getPointerOperand(), Indices, VecTy,
7538 CostKind);
7539 }
7540 }
7541
7542 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
7543 "Calculated GEPs cost for Tree"));
7544
7545 return VecCost - ScalarCost;
7546 };
7547
7548 switch (ShuffleOrOp) {
7549 case Instruction::PHI: {
7550 // Count reused scalars.
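// Vector PHIs are expected to be as cheap as their scalar counterparts, so
// no per-opcode cost is computed here: the entry cost is just the common
// shuffle cost minus the scalar cost already implied by the reuse-shuffle
// indices of the operand entries.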
7551 InstructionCost ScalarCost = 0;
7552 SmallPtrSet<const TreeEntry *, 4> CountedOps;
7553 for (Value *V : VL) {
7554 auto *PHI = dyn_cast<PHINode>(V);
7555 if (!PHI)
7556 continue;
7557
7558 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
7559 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
7560 Value *Op = PHI->getIncomingValue(I);
7561 Operands[I] = Op;
7562 }
7563 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
7564 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
7565 if (!OpTE->ReuseShuffleIndices.empty())
7566 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
7567 OpTE->Scalars.size());
7568 }
7569
7570 return CommonCost - ScalarCost;
7571 }
7572 case Instruction::ExtractValue:
7573 case Instruction::ExtractElement: {
7574 auto GetScalarCost = [=](unsigned Idx) {
7575 auto *I = cast<Instruction>(VL[Idx]);
7576 VectorType *SrcVecTy;
7577 if (ShuffleOrOp == Instruction::ExtractElement) {
7578 auto *EE = cast<ExtractElementInst>(I);
7579 SrcVecTy = EE->getVectorOperandType();
7580 } else {
7581 auto *EV = cast<ExtractValueInst>(I);
7582 Type *AggregateTy = EV->getAggregateOperand()->getType();
7583 unsigned NumElts;
7584 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
7585 NumElts = ATy->getNumElements();
7586 else
7587 NumElts = AggregateTy->getStructNumElements();
7588 SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
7589 }
7590 if (I->hasOneUse()) {
7591 Instruction *Ext = I->user_back();
7592 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
7593 all_of(Ext->users(),
7594 [](User *U) { return isa<GetElementPtrInst>(U); })) {
7595 // Use getExtractWithExtendCost() to calculate the cost of the
7596 // extractelement/ext pair.
7597 InstructionCost Cost = TTI->getExtractWithExtendCost(
7598 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
7599 // Subtract the cost of the s|zext, which is accounted for separately.
7600 Cost -= TTI->getCastInstrCost( 7601 Ext->getOpcode(), Ext->getType(), I->getType(), 7602 TTI::getCastContextHint(Ext), CostKind, Ext); 7603 return Cost; 7604 } 7605 } 7606 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, 7607 CostKind, *getExtractIndex(I)); 7608 }; 7609 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; }; 7610 return GetCostDiff(GetScalarCost, GetVectorCost); 7611 } 7612 case Instruction::InsertElement: { 7613 assert(E->ReuseShuffleIndices.empty() && 7614 "Unique insertelements only are expected."); 7615 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); 7616 unsigned const NumElts = SrcVecTy->getNumElements(); 7617 unsigned const NumScalars = VL.size(); 7618 7619 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); 7620 7621 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 7622 unsigned OffsetBeg = *getInsertIndex(VL.front()); 7623 unsigned OffsetEnd = OffsetBeg; 7624 InsertMask[OffsetBeg] = 0; 7625 for (auto [I, V] : enumerate(VL.drop_front())) { 7626 unsigned Idx = *getInsertIndex(V); 7627 if (OffsetBeg > Idx) 7628 OffsetBeg = Idx; 7629 else if (OffsetEnd < Idx) 7630 OffsetEnd = Idx; 7631 InsertMask[Idx] = I + 1; 7632 } 7633 unsigned VecScalarsSz = PowerOf2Ceil(NumElts); 7634 if (NumOfParts > 0) 7635 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); 7636 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * 7637 VecScalarsSz; 7638 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); 7639 unsigned InsertVecSz = std::min<unsigned>( 7640 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), 7641 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); 7642 bool IsWholeSubvector = 7643 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); 7644 // Check if we can safely insert a subvector. If it is not possible, just 7645 // generate a whole-sized vector and shuffle the source vector and the new 7646 // subvector. 7647 if (OffsetBeg + InsertVecSz > VecSz) { 7648 // Align OffsetBeg to generate correct mask. 7649 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); 7650 InsertVecSz = VecSz; 7651 } 7652 7653 APInt DemandedElts = APInt::getZero(NumElts); 7654 // TODO: Add support for Instruction::InsertValue. 7655 SmallVector<int> Mask; 7656 if (!E->ReorderIndices.empty()) { 7657 inversePermutation(E->ReorderIndices, Mask); 7658 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem); 7659 } else { 7660 Mask.assign(VecSz, PoisonMaskElem); 7661 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); 7662 } 7663 bool IsIdentity = true; 7664 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem); 7665 Mask.swap(PrevMask); 7666 for (unsigned I = 0; I < NumScalars; ++I) { 7667 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); 7668 DemandedElts.setBit(InsertIdx); 7669 IsIdentity &= InsertIdx - OffsetBeg == I; 7670 Mask[InsertIdx - OffsetBeg] = I; 7671 } 7672 assert(Offset < NumElts && "Failed to find vector index offset"); 7673 7674 InstructionCost Cost = 0; 7675 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, 7676 /*Insert*/ true, /*Extract*/ false, 7677 CostKind); 7678 7679 // First cost - resize to actual vector size if not identity shuffle or 7680 // need to shift the vector. 7681 // Do not calculate the cost if the actual size is the register size and 7682 // we can merge this shuffle with the following SK_Select. 
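// A rough illustrative example (values not from the original source):
// NumElts = 16 with NumOfParts = 2 gives VecScalarsSz = 8; inserts at
// indices 2..5 then yield Offset = 0, VecSz = 8 and InsertVecSz = 4, i.e.
// a 4-wide subvector is costed within the first 8-wide register-sized part.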
7683 auto *InsertVecTy = 7684 FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); 7685 if (!IsIdentity) 7686 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 7687 InsertVecTy, Mask); 7688 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 7689 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 7690 })); 7691 // Second cost - permutation with subvector, if some elements are from the 7692 // initial vector or inserting a subvector. 7693 // TODO: Implement the analysis of the FirstInsert->getOperand(0) 7694 // subvector of ActualVecTy. 7695 SmallBitVector InMask = 7696 isUndefVector(FirstInsert->getOperand(0), 7697 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); 7698 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { 7699 if (InsertVecSz != VecSz) { 7700 auto *ActualVecTy = 7701 FixedVectorType::get(SrcVecTy->getElementType(), VecSz); 7702 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, 7703 std::nullopt, CostKind, OffsetBeg - Offset, 7704 InsertVecTy); 7705 } else { 7706 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) 7707 Mask[I] = InMask.test(I) ? PoisonMaskElem : I; 7708 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; 7709 I <= End; ++I) 7710 if (Mask[I] != PoisonMaskElem) 7711 Mask[I] = I + VecSz; 7712 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) 7713 Mask[I] = 7714 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I; 7715 Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); 7716 } 7717 } 7718 return Cost; 7719 } 7720 case Instruction::ZExt: 7721 case Instruction::SExt: 7722 case Instruction::FPToUI: 7723 case Instruction::FPToSI: 7724 case Instruction::FPExt: 7725 case Instruction::PtrToInt: 7726 case Instruction::IntToPtr: 7727 case Instruction::SIToFP: 7728 case Instruction::UIToFP: 7729 case Instruction::Trunc: 7730 case Instruction::FPTrunc: 7731 case Instruction::BitCast: { 7732 auto GetScalarCost = [=](unsigned Idx) { 7733 auto *VI = cast<Instruction>(VL[Idx]); 7734 return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, 7735 VI->getOperand(0)->getType(), 7736 TTI::getCastContextHint(VI), CostKind, VI); 7737 }; 7738 auto GetVectorCost = [=](InstructionCost CommonCost) { 7739 Type *SrcTy = VL0->getOperand(0)->getType(); 7740 auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); 7741 InstructionCost VecCost = CommonCost; 7742 // Check if the values are candidates to demote. 7743 if (!MinBWs.count(VL0) || VecTy != SrcVecTy) 7744 VecCost += 7745 TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, 7746 TTI::getCastContextHint(VL0), CostKind, VL0); 7747 return VecCost; 7748 }; 7749 return GetCostDiff(GetScalarCost, GetVectorCost); 7750 } 7751 case Instruction::FCmp: 7752 case Instruction::ICmp: 7753 case Instruction::Select: { 7754 CmpInst::Predicate VecPred, SwappedVecPred; 7755 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); 7756 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || 7757 match(VL0, MatchCmp)) 7758 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); 7759 else 7760 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy() 7761 ? CmpInst::BAD_FCMP_PREDICATE 7762 : CmpInst::BAD_ICMP_PREDICATE; 7763 auto GetScalarCost = [&](unsigned Idx) { 7764 auto *VI = cast<Instruction>(VL[Idx]); 7765 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() 7766 ? 
CmpInst::BAD_FCMP_PREDICATE 7767 : CmpInst::BAD_ICMP_PREDICATE; 7768 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); 7769 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && 7770 !match(VI, MatchCmp)) || 7771 (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) 7772 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() 7773 ? CmpInst::BAD_FCMP_PREDICATE 7774 : CmpInst::BAD_ICMP_PREDICATE; 7775 7776 return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, 7777 Builder.getInt1Ty(), CurrentPred, CostKind, 7778 VI); 7779 }; 7780 auto GetVectorCost = [&](InstructionCost CommonCost) { 7781 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); 7782 7783 InstructionCost VecCost = TTI->getCmpSelInstrCost( 7784 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); 7785 // Check if it is possible and profitable to use min/max for selects 7786 // in VL. 7787 // 7788 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); 7789 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { 7790 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, 7791 {VecTy, VecTy}); 7792 InstructionCost IntrinsicCost = 7793 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 7794 // If the selects are the only uses of the compares, they will be 7795 // dead and we can adjust the cost by removing their cost. 7796 if (IntrinsicAndUse.second) 7797 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, 7798 MaskTy, VecPred, CostKind); 7799 VecCost = std::min(VecCost, IntrinsicCost); 7800 } 7801 return VecCost + CommonCost; 7802 }; 7803 return GetCostDiff(GetScalarCost, GetVectorCost); 7804 } 7805 case Instruction::FNeg: 7806 case Instruction::Add: 7807 case Instruction::FAdd: 7808 case Instruction::Sub: 7809 case Instruction::FSub: 7810 case Instruction::Mul: 7811 case Instruction::FMul: 7812 case Instruction::UDiv: 7813 case Instruction::SDiv: 7814 case Instruction::FDiv: 7815 case Instruction::URem: 7816 case Instruction::SRem: 7817 case Instruction::FRem: 7818 case Instruction::Shl: 7819 case Instruction::LShr: 7820 case Instruction::AShr: 7821 case Instruction::And: 7822 case Instruction::Or: 7823 case Instruction::Xor: { 7824 auto GetScalarCost = [=](unsigned Idx) { 7825 auto *VI = cast<Instruction>(VL[Idx]); 7826 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; 7827 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); 7828 TTI::OperandValueInfo Op2Info = 7829 TTI::getOperandInfo(VI->getOperand(OpIdx)); 7830 SmallVector<const Value *> Operands(VI->operand_values()); 7831 return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind, 7832 Op1Info, Op2Info, Operands, VI); 7833 }; 7834 auto GetVectorCost = [=](InstructionCost CommonCost) { 7835 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1;
7836 TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
7837 TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
7838 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
7839 Op2Info) +
7840 CommonCost;
7841 };
7842 return GetCostDiff(GetScalarCost, GetVectorCost);
7843 }
7844 case Instruction::GetElementPtr: {
7845 return CommonCost + GetGEPCostDiff(VL, VL0);
7846 }
7847 case Instruction::Load: {
7848 auto GetScalarCost = [=](unsigned Idx) {
7849 auto *VI = cast<LoadInst>(VL[Idx]);
7850 return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
7851 VI->getPointerAddressSpace(), CostKind,
7852 TTI::OperandValueInfo(), VI);
7853 };
7854 auto *LI0 = cast<LoadInst>(VL0);
7855 auto GetVectorCost = [=](InstructionCost CommonCost) {
7856 InstructionCost VecLdCost;
7857 if (E->State == TreeEntry::Vectorize) {
7858 VecLdCost = TTI->getMemoryOpCost(
7859 Instruction::Load, VecTy, LI0->getAlign(),
7860 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
7861 } else {
7862 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
7863 Align CommonAlignment = LI0->getAlign();
7864 for (Value *V : VL)
7865 CommonAlignment =
7866 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
7867 VecLdCost = TTI->getGatherScatterOpCost(
7868 Instruction::Load, VecTy, LI0->getPointerOperand(),
7869 /*VariableMask=*/false, CommonAlignment, CostKind);
7870 }
7871 return VecLdCost + CommonCost;
7872 };
7873
7874 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
7875 // If this node generates a masked gather load, then it is not a terminal
7876 // node. Hence the address operand cost is estimated separately.
7877 if (E->State == TreeEntry::ScatterVectorize)
7878 return Cost;
7879
7880 // Estimate the cost of the GEPs, since this tree node is a terminal node.
7881 SmallVector<Value *> PointerOps(VL.size());
7882 for (auto [I, V] : enumerate(VL))
7883 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7884 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
7885 }
7886 case Instruction::Store: {
7887 bool IsReorder = !E->ReorderIndices.empty();
7888 auto GetScalarCost = [=](unsigned Idx) {
7889 auto *VI = cast<StoreInst>(VL[Idx]);
7890 TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
7891 return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
7892 VI->getPointerAddressSpace(), CostKind,
7893 OpInfo, VI);
7894 };
7895 auto *BaseSI =
7896 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
7897 auto GetVectorCost = [=](InstructionCost CommonCost) {
7898 // We know that we can merge the stores. Calculate the cost.
7899 TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
7900 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
7901 BaseSI->getPointerAddressSpace(), CostKind,
7902 OpInfo) +
7903 CommonCost;
7904 };
7905 SmallVector<Value *> PointerOps(VL.size());
7906 for (auto [I, V] : enumerate(VL)) {
7907 unsigned Idx = IsReorder ?
E->ReorderIndices[I] : I; 7908 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand(); 7909 } 7910 7911 return GetCostDiff(GetScalarCost, GetVectorCost) + 7912 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); 7913 } 7914 case Instruction::Call: { 7915 auto GetScalarCost = [=](unsigned Idx) { 7916 auto *CI = cast<CallInst>(VL[Idx]); 7917 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7918 if (ID != Intrinsic::not_intrinsic) { 7919 IntrinsicCostAttributes CostAttrs(ID, *CI, 1); 7920 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 7921 } 7922 return TTI->getCallInstrCost(CI->getCalledFunction(), 7923 CI->getFunctionType()->getReturnType(), 7924 CI->getFunctionType()->params(), CostKind); 7925 }; 7926 auto GetVectorCost = [=](InstructionCost CommonCost) { 7927 auto *CI = cast<CallInst>(VL0); 7928 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); 7929 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; 7930 }; 7931 return GetCostDiff(GetScalarCost, GetVectorCost); 7932 } 7933 case Instruction::ShuffleVector: { 7934 assert(E->isAltShuffle() && 7935 ((Instruction::isBinaryOp(E->getOpcode()) && 7936 Instruction::isBinaryOp(E->getAltOpcode())) || 7937 (Instruction::isCast(E->getOpcode()) && 7938 Instruction::isCast(E->getAltOpcode())) || 7939 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 7940 "Invalid Shuffle Vector Operand"); 7941 // Try to find the previous shuffle node with the same operands and same 7942 // main/alternate ops. 7943 auto TryFindNodeWithEqualOperands = [=]() { 7944 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 7945 if (TE.get() == E) 7946 break; 7947 if (TE->isAltShuffle() && 7948 ((TE->getOpcode() == E->getOpcode() && 7949 TE->getAltOpcode() == E->getAltOpcode()) || 7950 (TE->getOpcode() == E->getAltOpcode() && 7951 TE->getAltOpcode() == E->getOpcode())) && 7952 TE->hasEqualOperands(*E)) 7953 return true; 7954 } 7955 return false; 7956 }; 7957 auto GetScalarCost = [=](unsigned Idx) { 7958 auto *VI = cast<Instruction>(VL[Idx]); 7959 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); 7960 (void)E; 7961 return TTI->getInstructionCost(VI, CostKind); 7962 }; 7963 // Need to clear CommonCost since the final shuffle cost is included into 7964 // vector cost. 7965 auto GetVectorCost = [&](InstructionCost) { 7966 // VecCost is equal to sum of the cost of creating 2 vectors 7967 // and the cost of creating shuffle. 7968 InstructionCost VecCost = 0; 7969 if (TryFindNodeWithEqualOperands()) { 7970 LLVM_DEBUG({ 7971 dbgs() << "SLP: diamond match for alternate node found.\n"; 7972 E->dump(); 7973 }); 7974 // No need to add new vector costs here since we're going to reuse 7975 // same main/alternate vector ops, just do different shuffling. 
7976 } else if (Instruction::isBinaryOp(E->getOpcode())) {
7977 VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
7978 VecCost +=
7979 TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
7980 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
7981 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
7982 VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
7983 CI0->getPredicate(), CostKind, VL0);
7984 VecCost += TTI->getCmpSelInstrCost(
7985 E->getOpcode(), VecTy, MaskTy,
7986 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
7987 E->getAltOp());
7988 } else {
7989 Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
7990 Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
7991 auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
7992 auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
7993 VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
7994 TTI::CastContextHint::None, CostKind);
7995 VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
7996 TTI::CastContextHint::None, CostKind);
7997 }
7998 if (E->ReuseShuffleIndices.empty()) {
7999 VecCost +=
8000 TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
8001 } else {
8002 SmallVector<int> Mask;
8003 buildShuffleEntryMask(
8004 E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
8005 [E](Instruction *I) {
8006 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
8007 return I->getOpcode() == E->getAltOpcode();
8008 },
8009 Mask);
8010 VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
8011 FinalVecTy, Mask);
8012 }
8013 return VecCost;
8014 };
8015 return GetCostDiff(GetScalarCost, GetVectorCost);
8016 }
8017 default:
8018 llvm_unreachable("Unknown instruction");
8019 }
8020 }
8021
8022 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
8023 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
8024 << VectorizableTree.size() << " is fully vectorizable.\n");
8025
8026 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
8027 SmallVector<int> Mask;
8028 return TE->State == TreeEntry::NeedToGather &&
8029 !any_of(TE->Scalars,
8030 [this](Value *V) { return EphValues.contains(V); }) &&
8031 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
8032 TE->Scalars.size() < Limit ||
8033 ((TE->getOpcode() == Instruction::ExtractElement ||
8034 all_of(TE->Scalars,
8035 [](Value *V) {
8036 return isa<ExtractElementInst, UndefValue>(V);
8037 })) &&
8038 isFixedVectorShuffle(TE->Scalars, Mask)) ||
8039 (TE->State == TreeEntry::NeedToGather &&
8040 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
8041 };
8042
8043 // We only handle trees of heights 1 and 2.
8044 if (VectorizableTree.size() == 1 &&
8045 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
8046 (ForReduction &&
8047 AreVectorizableGathers(VectorizableTree[0].get(),
8048 VectorizableTree[0]->Scalars.size()) &&
8049 VectorizableTree[0]->getVectorFactor() > 2)))
8050 return true;
8051
8052 if (VectorizableTree.size() != 2)
8053 return false;
8054
8055 // Handle splat and all-constant stores. Also try to vectorize tiny trees
8056 // with second gather nodes if they have fewer scalar operands than the
8057 // initial tree element (it may be profitable to shuffle the second gather),
8058 // or if they are extractelements, which form a shuffle.
8059 SmallVector<int> Mask; 8060 if (VectorizableTree[0]->State == TreeEntry::Vectorize && 8061 AreVectorizableGathers(VectorizableTree[1].get(), 8062 VectorizableTree[0]->Scalars.size())) 8063 return true; 8064 8065 // Gathering cost would be too much for tiny trees. 8066 if (VectorizableTree[0]->State == TreeEntry::NeedToGather || 8067 (VectorizableTree[1]->State == TreeEntry::NeedToGather && 8068 VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) 8069 return false; 8070 8071 return true; 8072 } 8073 8074 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, 8075 TargetTransformInfo *TTI, 8076 bool MustMatchOrInst) { 8077 // Look past the root to find a source value. Arbitrarily follow the 8078 // path through operand 0 of any 'or'. Also, peek through optional 8079 // shift-left-by-multiple-of-8-bits. 8080 Value *ZextLoad = Root; 8081 const APInt *ShAmtC; 8082 bool FoundOr = false; 8083 while (!isa<ConstantExpr>(ZextLoad) && 8084 (match(ZextLoad, m_Or(m_Value(), m_Value())) || 8085 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && 8086 ShAmtC->urem(8) == 0))) { 8087 auto *BinOp = cast<BinaryOperator>(ZextLoad); 8088 ZextLoad = BinOp->getOperand(0); 8089 if (BinOp->getOpcode() == Instruction::Or) 8090 FoundOr = true; 8091 } 8092 // Check if the input is an extended load of the required or/shift expression. 8093 Value *Load; 8094 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || 8095 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) 8096 return false; 8097 8098 // Require that the total load bit width is a legal integer type. 8099 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. 8100 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. 8101 Type *SrcTy = Load->getType(); 8102 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; 8103 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) 8104 return false; 8105 8106 // Everything matched - assume that we can fold the whole sequence using 8107 // load combining. 8108 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " 8109 << *(cast<Instruction>(Root)) << "\n"); 8110 8111 return true; 8112 } 8113 8114 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { 8115 if (RdxKind != RecurKind::Or) 8116 return false; 8117 8118 unsigned NumElts = VectorizableTree[0]->Scalars.size(); 8119 Value *FirstReduced = VectorizableTree[0]->Scalars[0]; 8120 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI, 8121 /* MatchOr */ false); 8122 } 8123 8124 bool BoUpSLP::isLoadCombineCandidate() const { 8125 // Peek through a final sequence of stores and check if all operations are 8126 // likely to be load-combined. 8127 unsigned NumElts = VectorizableTree[0]->Scalars.size(); 8128 for (Value *Scalar : VectorizableTree[0]->Scalars) { 8129 Value *X; 8130 if (!match(Scalar, m_Store(m_Value(X), m_Value())) || 8131 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true)) 8132 return false; 8133 } 8134 return true; 8135 } 8136 8137 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { 8138 // No need to vectorize inserts of gathered values. 
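// That is (reading of the check below): a 2-node tree whose root is a
// buildvector of gathered values is treated as tiny and unprofitable unless
// the gather node is a splat or an all-constant node wider than 2 lanes.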
8139 if (VectorizableTree.size() == 2 && 8140 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && 8141 VectorizableTree[1]->State == TreeEntry::NeedToGather && 8142 (VectorizableTree[1]->getVectorFactor() <= 2 || 8143 !(isSplat(VectorizableTree[1]->Scalars) || 8144 allConstant(VectorizableTree[1]->Scalars)))) 8145 return true; 8146 8147 // We can vectorize the tree if its size is greater than or equal to the 8148 // minimum size specified by the MinTreeSize command line option. 8149 if (VectorizableTree.size() >= MinTreeSize) 8150 return false; 8151 8152 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we 8153 // can vectorize it if we can prove it fully vectorizable. 8154 if (isFullyVectorizableTinyTree(ForReduction)) 8155 return false; 8156 8157 assert(VectorizableTree.empty() 8158 ? ExternalUses.empty() 8159 : true && "We shouldn't have any external users"); 8160 8161 // Otherwise, we can't vectorize the tree. It is both tiny and not fully 8162 // vectorizable. 8163 return true; 8164 } 8165 8166 InstructionCost BoUpSLP::getSpillCost() const { 8167 // Walk from the bottom of the tree to the top, tracking which values are 8168 // live. When we see a call instruction that is not part of our tree, 8169 // query TTI to see if there is a cost to keeping values live over it 8170 // (for example, if spills and fills are required). 8171 unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); 8172 InstructionCost Cost = 0; 8173 8174 SmallPtrSet<Instruction *, 4> LiveValues; 8175 Instruction *PrevInst = nullptr; 8176 8177 // The entries in VectorizableTree are not necessarily ordered by their 8178 // position in basic blocks. Collect them and order them by dominance so later 8179 // instructions are guaranteed to be visited first. For instructions in 8180 // different basic blocks, we only scan to the beginning of the block, so 8181 // their order does not matter, as long as all instructions in a basic block 8182 // are grouped together. Using dominance ensures a deterministic order. 8183 SmallVector<Instruction *, 16> OrderedScalars; 8184 for (const auto &TEPtr : VectorizableTree) { 8185 if (TEPtr->State != TreeEntry::Vectorize) 8186 continue; 8187 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); 8188 if (!Inst) 8189 continue; 8190 OrderedScalars.push_back(Inst); 8191 } 8192 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { 8193 auto *NodeA = DT->getNode(A->getParent()); 8194 auto *NodeB = DT->getNode(B->getParent()); 8195 assert(NodeA && "Should only process reachable instructions"); 8196 assert(NodeB && "Should only process reachable instructions"); 8197 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 8198 "Different nodes should have different DFS numbers"); 8199 if (NodeA != NodeB) 8200 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn(); 8201 return B->comesBefore(A); 8202 }); 8203 8204 for (Instruction *Inst : OrderedScalars) { 8205 if (!PrevInst) { 8206 PrevInst = Inst; 8207 continue; 8208 } 8209 8210 // Update LiveValues. 8211 LiveValues.erase(PrevInst); 8212 for (auto &J : PrevInst->operands()) { 8213 if (isa<Instruction>(&*J) && getTreeEntry(&*J)) 8214 LiveValues.insert(cast<Instruction>(&*J)); 8215 } 8216 8217 LLVM_DEBUG({ 8218 dbgs() << "SLP: #LV: " << LiveValues.size(); 8219 for (auto *X : LiveValues) 8220 dbgs() << " " << X->getName(); 8221 dbgs() << ", Looking at "; 8222 Inst->dump(); 8223 }); 8224 8225 // Now find the sequence of instructions between PrevInst and Inst. 
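// The walk below starts at PrevInst (the later instruction in program
// order) and steps backwards toward Inst, hopping to Inst's block when the
// two live in different blocks. Every real call encountered in between is
// assumed to force a spill/fill of all currently live bundle values.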
8226 unsigned NumCalls = 0;
8227 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
8228 PrevInstIt =
8229 PrevInst->getIterator().getReverse();
8230 while (InstIt != PrevInstIt) {
8231 if (PrevInstIt == PrevInst->getParent()->rend()) {
8232 PrevInstIt = Inst->getParent()->rbegin();
8233 continue;
8234 }
8235
8236 auto NoCallIntrinsic = [this](Instruction *I) {
8237 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
8238 if (II->isAssumeLikeIntrinsic())
8239 return true;
8240 FastMathFlags FMF;
8241 SmallVector<Type *, 4> Tys;
8242 for (auto &ArgOp : II->args())
8243 Tys.push_back(ArgOp->getType());
8244 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
8245 FMF = FPMO->getFastMathFlags();
8246 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
8247 FMF);
8248 InstructionCost IntrCost =
8249 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
8250 InstructionCost CallCost = TTI->getCallInstrCost(
8251 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
8252 if (IntrCost < CallCost)
8253 return true;
8254 }
8255 return false;
8256 };
8257
8258 // Debug information does not impact spill cost.
8259 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
8260 &*PrevInstIt != PrevInst)
8261 NumCalls++;
8262
8263 ++PrevInstIt;
8264 }
8265
8266 if (NumCalls) {
8267 SmallVector<Type *, 4> V;
8268 for (auto *II : LiveValues) {
8269 auto *ScalarTy = II->getType();
8270 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
8271 ScalarTy = VectorTy->getElementType();
8272 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
8273 }
8274 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
8275 }
8276
8277 PrevInst = Inst;
8278 }
8279
8280 return Cost;
8281 }
8282
8283 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
8284 /// the buildvector sequence.
8285 static bool isFirstInsertElement(const InsertElementInst *IE1,
8286 const InsertElementInst *IE2) {
8287 if (IE1 == IE2)
8288 return false;
8289 const auto *I1 = IE1;
8290 const auto *I2 = IE2;
8291 const InsertElementInst *PrevI1;
8292 const InsertElementInst *PrevI2;
8293 unsigned Idx1 = *getInsertIndex(IE1);
8294 unsigned Idx2 = *getInsertIndex(IE2);
8295 do {
8296 if (I2 == IE1)
8297 return true;
8298 if (I1 == IE2)
8299 return false;
8300 PrevI1 = I1;
8301 PrevI2 = I2;
8302 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
8303 getInsertIndex(I1).value_or(Idx2) != Idx2)
8304 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
8305 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
8306 getInsertIndex(I2).value_or(Idx1) != Idx1)
8307 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
8308 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
8309 llvm_unreachable("Two different buildvectors not expected.");
8310 }
8311
8312 namespace {
8313 /// Returns the incoming Value * if the requested type is Value * too, or a
8314 /// default value otherwise.
8315 struct ValueSelect {
8316 template <typename U>
8317 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
8318 return V;
8319 }
8320 template <typename U>
8321 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
8322 return U();
8323 }
8324 };
8325 } // namespace
8326
8327 /// Does the analysis of the provided shuffle masks and performs the requested
8328 /// actions on the vectors with the given shuffle masks. It tries to do it in
8329 /// several steps.
8330 /// 1. If the Base vector is not an undef vector, resize the very first mask to
8331 /// have a common VF and perform the action for 2 input vectors (including
8332 /// the non-undef Base). Other shuffle masks are combined with the result
8333 /// after the first stage and processed as a shuffle of 2 elements.
8334 /// 2. If the Base is an undef vector and there is only 1 shuffle mask,
8335 /// perform the action only for 1 vector with the given mask, if it is not
8336 /// the identity mask.
8337 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
8338 /// vectors, combining the masks properly between the steps.
8339 template <typename T>
8340 static T *performExtractsShuffleAction(
8341 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
8342 function_ref<unsigned(T *)> GetVF,
8343 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
8344 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
8345 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
8346 SmallVector<int> Mask(ShuffleMask.begin()->second);
8347 auto VMIt = std::next(ShuffleMask.begin());
8348 T *Prev = nullptr;
8349 SmallBitVector UseMask =
8350 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
8351 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
8352 if (!IsBaseUndef.all()) {
8353 // Base is not undef, need to combine it with the next subvectors.
8354 std::pair<T *, bool> Res =
8355 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
8356 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
8357 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
8358 if (Mask[Idx] == PoisonMaskElem)
8359 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
8360 else
8361 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
8362 }
8363 auto *V = ValueSelect::get<T *>(Base);
8364 (void)V;
8365 assert((!V || GetVF(V) == Mask.size()) &&
8366 "Expected base vector of VF number of elements.");
8367 Prev = Action(Mask, {nullptr, Res.first});
8368 } else if (ShuffleMask.size() == 1) {
8369 // Base is undef and only 1 vector is shuffled - perform the action only
8370 // for a single vector, if the mask is not the identity mask.
8371 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
8372 /*ForSingleMask=*/true);
8373 if (Res.second)
8374 // Identity mask is found.
8375 Prev = Res.first;
8376 else
8377 Prev = Action(Mask, {ShuffleMask.begin()->first});
8378 } else {
8379 // Base is undef and at least 2 input vectors are shuffled - perform
8380 // 2-vector shuffles step by step, combining the masks between the steps.
8381 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
8382 unsigned Vec2VF = GetVF(VMIt->first);
8383 if (Vec1VF == Vec2VF) {
8384 // No need to resize the input vectors since they are of the same size,
8385 // we can shuffle them directly.
8386 ArrayRef<int> SecMask = VMIt->second;
8387 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
8388 if (SecMask[I] != PoisonMaskElem) {
8389 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
8390 Mask[I] = SecMask[I] + Vec1VF;
8391 }
8392 }
8393 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
8394 } else {
8395 // Vectors of different sizes - resize and reshuffle.
8396 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask, 8397 /*ForSingleMask=*/false); 8398 std::pair<T *, bool> Res2 = 8399 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 8400 ArrayRef<int> SecMask = VMIt->second; 8401 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 8402 if (Mask[I] != PoisonMaskElem) { 8403 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars."); 8404 if (Res1.second) 8405 Mask[I] = I; 8406 } else if (SecMask[I] != PoisonMaskElem) { 8407 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); 8408 Mask[I] = (Res2.second ? I : SecMask[I]) + VF; 8409 } 8410 } 8411 Prev = Action(Mask, {Res1.first, Res2.first}); 8412 } 8413 VMIt = std::next(VMIt); 8414 } 8415 bool IsBaseNotUndef = !IsBaseUndef.all(); 8416 (void)IsBaseNotUndef; 8417 // Perform requested actions for the remaining masks/vectors. 8418 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { 8419 // Shuffle other input vectors, if any. 8420 std::pair<T *, bool> Res = 8421 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 8422 ArrayRef<int> SecMask = VMIt->second; 8423 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 8424 if (SecMask[I] != PoisonMaskElem) { 8425 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) && 8426 "Multiple uses of scalars."); 8427 Mask[I] = (Res.second ? I : SecMask[I]) + VF; 8428 } else if (Mask[I] != PoisonMaskElem) { 8429 Mask[I] = I; 8430 } 8431 } 8432 Prev = Action(Mask, {Prev, Res.first}); 8433 } 8434 return Prev; 8435 } 8436 8437 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { 8438 // Build a map for gathered scalars to the nodes where they are used. 8439 ValueToGatherNodes.clear(); 8440 for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { 8441 if (EntryPtr->State != TreeEntry::NeedToGather) 8442 continue; 8443 for (Value *V : EntryPtr->Scalars) 8444 if (!isConstant(V)) 8445 ValueToGatherNodes.try_emplace(V).first->getSecond().insert( 8446 EntryPtr.get()); 8447 } 8448 InstructionCost Cost = 0; 8449 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " 8450 << VectorizableTree.size() << ".\n"); 8451 8452 unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); 8453 8454 SmallPtrSet<Value *, 4> CheckedExtracts; 8455 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { 8456 TreeEntry &TE = *VectorizableTree[I]; 8457 if (TE.State == TreeEntry::NeedToGather) { 8458 if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); 8459 E && E->getVectorFactor() == TE.getVectorFactor() && 8460 E->isSame(TE.Scalars)) { 8461 // Some gather nodes might be absolutely the same as some vectorizable 8462 // nodes after reordering, need to handle it. 
8463 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " 8464 << *TE.Scalars[0] << ".\n" 8465 << "SLP: Current total cost = " << Cost << "\n"); 8466 continue; 8467 } 8468 } 8469 8470 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); 8471 Cost += C; 8472 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 8473 << " for bundle that starts with " << *TE.Scalars[0] 8474 << ".\n" 8475 << "SLP: Current total cost = " << Cost << "\n"); 8476 } 8477 8478 SmallPtrSet<Value *, 16> ExtractCostCalculated; 8479 InstructionCost ExtractCost = 0; 8480 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; 8481 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; 8482 SmallVector<APInt> DemandedElts; 8483 for (ExternalUser &EU : ExternalUses) { 8484 // We only add extract cost once for the same scalar. 8485 if (!isa_and_nonnull<InsertElementInst>(EU.User) && 8486 !ExtractCostCalculated.insert(EU.Scalar).second) 8487 continue; 8488 8489 // Uses by ephemeral values are free (because the ephemeral value will be 8490 // removed prior to code generation, and so the extraction will be 8491 // removed as well). 8492 if (EphValues.count(EU.User)) 8493 continue; 8494 8495 // No extract cost for vector "scalar" 8496 if (isa<FixedVectorType>(EU.Scalar->getType())) 8497 continue; 8498 8499 // If found user is an insertelement, do not calculate extract cost but try 8500 // to detect it as a final shuffled/identity match. 8501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { 8502 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { 8503 std::optional<unsigned> InsertIdx = getInsertIndex(VU); 8504 if (InsertIdx) { 8505 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); 8506 auto *It = find_if( 8507 FirstUsers, 8508 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) { 8509 return areTwoInsertFromSameBuildVector( 8510 VU, cast<InsertElementInst>(Pair.first), 8511 [this](InsertElementInst *II) -> Value * { 8512 Value *Op0 = II->getOperand(0); 8513 if (getTreeEntry(II) && !getTreeEntry(Op0)) 8514 return nullptr; 8515 return Op0; 8516 }); 8517 }); 8518 int VecId = -1; 8519 if (It == FirstUsers.end()) { 8520 (void)ShuffleMasks.emplace_back(); 8521 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE]; 8522 if (Mask.empty()) 8523 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 8524 // Find the insertvector, vectorized in tree, if any. 8525 Value *Base = VU; 8526 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { 8527 if (IEBase != EU.User && 8528 (!IEBase->hasOneUse() || 8529 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) 8530 break; 8531 // Build the mask for the vectorized insertelement instructions. 
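// Walk down the buildvector chain through operand 0 while it stays within
// the same vectorized tree entry, marking each insert position as an
// identity element of the mask.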
8532 if (const TreeEntry *E = getTreeEntry(IEBase)) { 8533 VU = IEBase; 8534 do { 8535 IEBase = cast<InsertElementInst>(Base); 8536 int Idx = *getInsertIndex(IEBase); 8537 assert(Mask[Idx] == PoisonMaskElem && 8538 "InsertElementInstruction used already."); 8539 Mask[Idx] = Idx; 8540 Base = IEBase->getOperand(0); 8541 } while (E == getTreeEntry(Base)); 8542 break; 8543 } 8544 Base = cast<InsertElementInst>(Base)->getOperand(0); 8545 } 8546 FirstUsers.emplace_back(VU, ScalarTE); 8547 DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); 8548 VecId = FirstUsers.size() - 1; 8549 } else { 8550 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) 8551 It->first = VU; 8552 VecId = std::distance(FirstUsers.begin(), It); 8553 } 8554 int InIdx = *InsertIdx; 8555 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE]; 8556 if (Mask.empty()) 8557 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 8558 Mask[InIdx] = EU.Lane; 8559 DemandedElts[VecId].setBit(InIdx); 8560 continue; 8561 } 8562 } 8563 } 8564 8565 // If we plan to rewrite the tree in a smaller type, we will need to sign 8566 // extend the extracted value back to the original type. Here, we account 8567 // for the extract and the added cost of the sign extend if needed. 8568 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); 8569 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 8570 auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; 8571 if (MinBWs.count(ScalarRoot)) { 8572 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); 8573 auto Extend = 8574 MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; 8575 VecTy = FixedVectorType::get(MinTy, BundleWidth); 8576 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), 8577 VecTy, EU.Lane); 8578 } else { 8579 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 8580 CostKind, EU.Lane); 8581 } 8582 } 8583 8584 InstructionCost SpillCost = getSpillCost(); 8585 Cost += SpillCost + ExtractCost; 8586 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask, 8587 bool) { 8588 InstructionCost C = 0; 8589 unsigned VF = Mask.size(); 8590 unsigned VecVF = TE->getVectorFactor(); 8591 if (VF != VecVF && 8592 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || 8593 (all_of(Mask, 8594 [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && 8595 !ShuffleVectorInst::isIdentityMask(Mask)))) { 8596 SmallVector<int> OrigMask(VecVF, PoisonMaskElem); 8597 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), 8598 OrigMask.begin()); 8599 C = TTI->getShuffleCost( 8600 TTI::SK_PermuteSingleSrc, 8601 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask); 8602 LLVM_DEBUG( 8603 dbgs() << "SLP: Adding cost " << C 8604 << " for final shuffle of insertelement external users.\n"; 8605 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); 8606 Cost += C; 8607 return std::make_pair(TE, true); 8608 } 8609 return std::make_pair(TE, false); 8610 }; 8611 // Calculate the cost of the reshuffled vectors, if any. 
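// For each destination buildvector: resize the per-entry masks to the
// target VF (ResizeToVF), fold the collected shuffles together via
// performExtractsShuffleAction/EstimateShufflesCost, and finally subtract
// the scalarization overhead of the demanded lanes, since those
// insertelement instructions are expected to disappear after vectorization.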
8612 for (int I = 0, E = FirstUsers.size(); I < E; ++I) { 8613 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); 8614 unsigned VF = ShuffleMasks[I].begin()->second.size(); 8615 auto *FTy = FixedVectorType::get( 8616 cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); 8617 auto Vector = ShuffleMasks[I].takeVector(); 8618 auto &&EstimateShufflesCost = [this, FTy, 8619 &Cost](ArrayRef<int> Mask, 8620 ArrayRef<const TreeEntry *> TEs) { 8621 assert((TEs.size() == 1 || TEs.size() == 2) && 8622 "Expected exactly 1 or 2 tree entries."); 8623 if (TEs.size() == 1) { 8624 int Limit = 2 * Mask.size(); 8625 if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || 8626 !ShuffleVectorInst::isIdentityMask(Mask)) { 8627 InstructionCost C = 8628 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); 8629 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 8630 << " for final shuffle of insertelement " 8631 "external users.\n"; 8632 TEs.front()->dump(); 8633 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 8634 Cost += C; 8635 } 8636 } else { 8637 InstructionCost C = 8638 TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); 8639 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 8640 << " for final shuffle of vector node and external " 8641 "insertelement users.\n"; 8642 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); 8643 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 8644 Cost += C; 8645 } 8646 return TEs.back(); 8647 }; 8648 (void)performExtractsShuffleAction<const TreeEntry>( 8649 MutableArrayRef(Vector.data(), Vector.size()), Base, 8650 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, 8651 EstimateShufflesCost); 8652 InstructionCost InsertCost = TTI->getScalarizationOverhead( 8653 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I], 8654 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput); 8655 Cost -= InsertCost; 8656 } 8657 8658 #ifndef NDEBUG 8659 SmallString<256> Str; 8660 { 8661 raw_svector_ostream OS(Str); 8662 OS << "SLP: Spill Cost = " << SpillCost << ".\n" 8663 << "SLP: Extract Cost = " << ExtractCost << ".\n" 8664 << "SLP: Total Cost = " << Cost << ".\n"; 8665 } 8666 LLVM_DEBUG(dbgs() << Str); 8667 if (ViewSLPTree) 8668 ViewGraph(this, "SLP" + F->getName(), false, Str); 8669 #endif 8670 8671 return Cost; 8672 } 8673 8674 std::optional<TargetTransformInfo::ShuffleKind> 8675 BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, 8676 SmallVectorImpl<int> &Mask, 8677 SmallVectorImpl<const TreeEntry *> &Entries) { 8678 Entries.clear(); 8679 // No need to check for the topmost gather node. 8680 if (TE == VectorizableTree.front().get()) 8681 return std::nullopt; 8682 Mask.assign(VL.size(), PoisonMaskElem); 8683 assert(TE->UserTreeIndices.size() == 1 && 8684 "Expected only single user of the gather node."); 8685 // TODO: currently checking only for Scalars in the tree entry, need to count 8686 // reused elements too for better cost estimation. 8687 Instruction &UserInst = 8688 getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); 8689 BasicBlock *ParentBB = nullptr; 8690 // Main node of PHI entries keeps the correct order of operands/incoming 8691 // blocks. 
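// For a PHI user the gathered vector has to be available at the end of the
// corresponding incoming block, so the ordering checks below are anchored
// at that block rather than at the PHI instruction itself.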
8692 if (auto *PHI =
8693 dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
8694 ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
8695 } else {
8696 ParentBB = UserInst.getParent();
8697 }
8698 auto *NodeUI = DT->getNode(ParentBB);
8699 assert(NodeUI && "Should only process reachable instructions");
8700 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
8701 auto CheckOrdering = [&](Instruction *LastEI) {
8702 // Check if the user node of the TE comes after the user node of EntryPtr;
8703 // otherwise, EntryPtr depends on TE.
8704 // Gather nodes usually are not scheduled and are inserted before their
8705 // first user node. So, instead of checking the dependency between the
8706 // gather nodes themselves, we check the dependency between their user
8707 // nodes. If one user node comes before the second one, we cannot use the
8708 // second gather node as the source vector for the first gather node,
8709 // because in the list of instructions it will be emitted later.
8710 auto *EntryParent = LastEI->getParent();
8711 auto *NodeEUI = DT->getNode(EntryParent);
8712 if (!NodeEUI)
8713 return false;
8714 assert((NodeUI == NodeEUI) ==
8715 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
8716 "Different nodes should have different DFS numbers");
8717 // Check the order of the gather nodes users.
8718 if (UserInst.getParent() != EntryParent &&
8719 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
8720 return false;
8721 if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
8722 return false;
8723 return true;
8724 };
8725 // Find all tree entries used by the gathered values. If no common entries
8726 // are found - this is not a shuffle.
8727 // Here we build a set of tree nodes for each gathered value and try to
8728 // find the intersection between these sets. If we have at least one common
8729 // tree node for each gathered value - we have just a permutation of a
8730 // single vector. If we have 2 different sets - we have a permutation of 2
8731 // input vectors.
8732 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
8733 DenseMap<Value *, int> UsedValuesEntry;
8734 for (Value *V : VL) {
8735 if (isConstant(V))
8736 continue;
8737 // Build a list of tree entries where V is used.
8738 SmallPtrSet<const TreeEntry *, 4> VToTEs;
8739 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
8740 if (TEPtr == TE)
8741 continue;
8742 assert(any_of(TEPtr->Scalars,
8743 [&](Value *V) { return GatheredScalars.contains(V); }) &&
8744 "Must contain at least single gathered value.");
8745 assert(TEPtr->UserTreeIndices.size() == 1 &&
8746 "Expected only single user of the gather node.");
8747 PHINode *EntryPHI =
8748 dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
8749 Instruction *EntryUserInst =
8750 EntryPHI ? nullptr
8751 : &getLastInstructionInBundle(
8752 TEPtr->UserTreeIndices.front().UserTE);
8753 if (&UserInst == EntryUserInst) {
8754 assert(!EntryPHI && "Unexpected phi node entry.");
8755 // If 2 gathers are operands of the same entry, compare operand
8756 // indices and use the earlier one as the base.
8757 if (TE->UserTreeIndices.front().UserTE ==
8758 TEPtr->UserTreeIndices.front().UserTE &&
8759 TE->UserTreeIndices.front().EdgeIdx <
8760 TEPtr->UserTreeIndices.front().EdgeIdx)
8761 continue;
8762 }
8763 // Check if the user node of the TE comes after the user node of
8764 // EntryPtr; otherwise, EntryPtr depends on TE.
      auto *EntryI =
          EntryPHI
              ? EntryPHI
                    ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
                    ->getTerminator()
              : EntryUserInst;
      if ((ParentBB != EntryI->getParent() ||
           TE->UserTreeIndices.front().EdgeIdx <
               TEPtr->UserTreeIndices.front().EdgeIdx ||
           TE->UserTreeIndices.front().UserTE !=
               TEPtr->UserTreeIndices.front().UserTE) &&
          !CheckOrdering(EntryI))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
      if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
        continue;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes into the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have another input
      // vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of previously listed tree
        // entries and tree entries using current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue analysis for the next
          // scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2, this is not a
        // permutation; fall back to the regular gather.
        // TODO: support multiple reshuffled nodes.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty())
    return std::nullopt;

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // First, try to find a perfect match in another gather node.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
      Entries.push_back(*It);
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
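    // A hypothetical illustration of the matching below: if the first set
    // holds entries with vector factors {4, 8} and the second set {8, 16},
    // the pair sharing factor 8 is picked, so both sources can feed a single
    // two-source shuffle without resizing.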
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same here, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose the 2
    // with the max index.
    if (Entries.empty()) {
      Entries.push_back(
          *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
                            [](const TreeEntry *TE1, const TreeEntry *TE2) {
                              return TE1->Idx < TE2->Idx;
                            }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    }
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
  // Checks whether the 2 PHIs are compatible, i.e. have a high chance of
  // being vectorized together.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions that do not form splats,
  // are not vectorized/not extractelements (these instructions will be
  // handled by extractelements processing) or may form a vector node in the
  // future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    SmallVector<Value *> IgnoredVals;
    if (UserIgnoreList)
      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. This is possible if they have the same/alternate
  // opcode and the same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI).getOpcode() &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that may be vectorized later as part of a vector build sequence.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries, which can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry, set
    // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
    // These indices are used when calculating the final shuffle mask as the
    // vector offset.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
    // We may have only 1 or 2 entries here. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we already
    // have some shuffles before. Cut off the unprofitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    Mask[Pair.second] = Pair.first * VF +
                        Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
    IsIdentity &= Mask[Pair.second] == Pair.second;
  }
  switch (Entries.size()) {
  case 1:
    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteSingleSrc;
    break;
  case 2:
    if (EntryLanes.size() > 2 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteTwoSrc;
    break;
  default:
    break;
  }
  Entries.clear();
  return std::nullopt;
}

InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
                                       bool ForPoisonSrc) const {
  // Find the type of the operands in VL.
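  // (For a store bundle, the stored value's type is used instead, since that
  // is what actually lands in the vector; as a hypothetical example, for
  // "store i32 %x, ptr %p" the scalar type is i32, not the store's own type.)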
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseSet<Value *> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      continue;
    }
    if (!UniqueElements.insert(V).second) {
      DuplicateNonConst = true;
      ShuffledElements.setBit(I);
      continue;
    }
    EstimateInsertCost(I, V);
  }
  if (ForPoisonSrc)
    Cost =
        TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
                                      /*Extract*/ false, CostKind);
  if (DuplicateNonConst)
    Cost +=
        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  return Cost;
}

// Perform operand reordering on the instructions in VL and return the
// reordered operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(
    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
    SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
    const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, TLI, DL, SE, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}

Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
  if (Res.second)
    return *Res.second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices).
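  // A hypothetical illustration of the DFS-number tie-breaking used below:
  // for a bundle {%a, %b} with
  //   bb1:  %a = add i32 ...   ; DFS-in(bb1) = 2
  //   bb2:  %b = add i32 ...   ; DFS-in(bb2) = 5
  // FindLastInst picks %b because bb2 has the larger DFS-in number, while
  // FindFirstInst would pick %a.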
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(V))
      return true;
    auto *I = cast<Instruction>(V);
    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
           isVectorLikeInstWithConstOps(I);
  }));

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if (doesNotNeedToSchedule(E->Scalars) ||
      (E->State != TreeEntry::NeedToGather &&
       all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
    if ((E->getOpcode() == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [](Value *V) {
          return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
        }))
      Res.second = FindLastInst();
    else
      Res.second = FindFirstInst();
    return *Res.second;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          Res.second = Bundle->Inst;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force. We
  // iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTree_rec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here is
  // not ideal. However, this should be exceedingly rare since it requires that
  // we both exit early from buildTree_rec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!Res.second)
    Res.second = FindLastInst();
  assert(Res.second && "Failed to find last instruction in bundle");
  return *Res.second;
}

void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInst = LastInst->getParent()->getFirstNonPHI();
  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
                doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}

Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant
  // instructions out of the loop's body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
    Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (TreeEntry *Entry = getTreeEntry(V)) {
      // Find which lane we need to extract.
      unsigned FoundLane = Entry->findLaneForValue(V);
      ExternalUses.emplace_back(V, InsElt, FoundLane);
    }
    return Vec;
  };
  Value *Val0 =
      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I);
  // Append instructions, which are/may be part of the loop, at the end to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second);

  return Vec;
}

/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is
/// actually required. Otherwise, shuffle instruction emission is delayed
/// until the end of the process, to reduce the number of emitted instructions
/// and further analysis/transformations.
/// The class will also look through the previously emitted shuffle
/// instructions and properly mark indices in the mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If the 2 operands are of different sizes, the smaller one will be resized
/// and the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands. If a 3rd one is going to be added, the first 2 are combined
  /// into a shuffle with the \p CommonMask mask; the first operand is then
  /// set to the resulting shuffle and the second to the newly added operand.
  /// The \p CommonMask is transformed accordingly after that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    SetVector<BasicBlock *> &CSEBlocks;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     SetVector<BasicBlock *> &CSEBlocks)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
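    /// For example (a hypothetical mask), given a <4 x ty> operand %v and
    /// mask <3, 2, 1, 0> this emits
    /// \code
    /// %shuf = shufflevector <4 x ty> %v, poison, <3, 2, 1, 0>
    /// \endcode
    /// while an identity mask such as <0, 1, 2, 3> simply returns %v.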
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(FixedVectorType::get(Ty, VF));
    }
    /// Resizes the 2 input vectors to match in size, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger vector.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      if (MinVF == V1VF)
        V1 = Op;
      else
        V2 = Op;
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

public:
  ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
      : Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      // If the only use is vectorized, we can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
            return !R.ScalarToTreeEntry.count(U);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
    // No need to delay emission if all deps are ready.
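    // Otherwise a transient placeholder is built below: an aligned load of a
    // poison pointer, standing in for the not-yet-emitted vector until the
    // postponed gather is materialized at the end of the process.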
    if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
      return nullptr;
    // Postpone gather emission; it will be emitted after the end of the
    // process to keep the correct order.
    auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
                                       E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
        MaybeAlign());
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds one more input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      if (!isa<FixedVectorType>(V1->getType())) {
        V1 = createShuffle(V1, nullptr, CommonMask);
        CommonMask.assign(Mask.size(), PoisonMaskElem);
        transformMaskAfterShuffle(CommonMask, Mask);
      }
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType() ||
          !isa<FixedVectorType>(V1->getType())) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + Sz
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required when the used elements are
      // already taken from the first one.
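      // Hypothetical illustration: with CommonMask = <0, poison, 2, poison>
      // and Mask = <poison, 1, poison, 3>, the new vector contributes lanes
      // 1 and 3, so it must be recorded as the second input.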
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = CommonMask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds one more input vector and the element order for its shuffling.
  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
  Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
    return R.gather(VL, Root);
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalizes emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// application of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};

Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for a GEP bundle, which may include non-GEP values.
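  // E.g., a hypothetical bundle {%gep1, %ptr.phi, %gep2} has no common opcode
  // as a whole, but borrowing the opcode from the first GEP found still lets
  // the bundle be matched against a vectorized GEP node.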
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It =
        find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (S.getOpcode()) {
    if (TreeEntry *VE = getTreeEntry(S.OpValue);
        VE && VE->isSame(VL) &&
        (any_of(VE->UserTreeIndices,
                [E, NodeIdx](const EdgeInfo &EI) {
                  return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                }) ||
         any_of(VectorizableTree,
                [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                  return TE->isOperandGatherNode({E, NodeIdx}) &&
                         VE->isSame(TE->Scalars);
                }))) {
      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
        ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
        ShuffleBuilder.add(V, Mask);
        return ShuffleBuilder.finalize(std::nullopt);
      };
      Value *V = vectorizeTree(VE);
      if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
        if (!VE->ReuseShuffleIndices.empty()) {
          // Reshuffle to get only unique values.
          // If some of the scalars are duplicated in the vectorization
          // tree entry, we do not vectorize them but instead generate a
          // mask for the reuses. But if there are several users of the
          // same entry, they may have different vectorization factors.
          // This is especially important for PHI nodes. In this case, we
          // need to adapt the resulting instruction for the user
          // vectorization factor and have to reshuffle it again to take
          // only unique elements of the vector. Without this code, the
          // function would incorrectly return a reduced vector instruction
          // with the same elements, not with the unique ones.

          // block:
          // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
          // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
          // ... (use %2)
          // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
          // br %block
          SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
          SmallSet<int, 4> UsedIdxs;
          int Pos = 0;
          for (int Idx : VE->ReuseShuffleIndices) {
            if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
                UsedIdxs.insert(Idx).second)
              UniqueIdxs[Idx] = Pos;
            ++Pos;
          }
          assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
                                          "less than original vector size.");
          UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
          V = FinalShuffle(V, UniqueIdxs);
        } else {
          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
                 "Expected vectorization factor less "
                 "than original vector size.");
          SmallVector<int> UniformMask(VF, 0);
          std::iota(UniformMask.begin(), UniformMask.end(), 0);
          V = FinalShuffle(V, UniformMask);
        }
      }
      // Need to update the operand gather node if the operand is actually not
      // a vectorized node but a buildvector/gather node that matches one of
      // the vectorized nodes.
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->State == TreeEntry::NeedToGather &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it.
  // This allows us to be more accurate with tree/graph transformations and
  // checks the correctness of the transformations in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (E->getOpcode() != Instruction::InsertElement &&
      E->getOpcode() != Instruction::PHI) {
    Instruction *LastInst = &getLastInstructionInBundle(E);
    assert(LastInst && "Failed to find last instruction in bundle");
    Builder.SetInsertPoint(LastInst);
  }
  return vectorizeTree(I->get());
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
  assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
                                        E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder the scalars per this
  // mask.
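  // A hypothetical illustration of the inversion below: ReorderIndices =
  // {2, 0, 1} gives ReorderMask = {1, 2, 0} (Mask[ReorderIndices[I]] == I),
  // so reorderScalars moves the scalar at position I to lane ReorderMask[I].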
  SmallVector<int> ReorderMask;
  inversePermutation(E->ReorderIndices, ReorderMask);
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    auto *It =
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    unsigned I =
        *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
    int Sz = Mask.size();
    if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
        ShuffleVectorInst::isIdentityMask(Mask))
      std::iota(Mask.begin(), Mask.end(), 0);
    else
      std::fill(Mask.begin(), Mask.end(), I);
    return true;
  };
  BVTy ShuffleBuilder(Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask;
  std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
  std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
  SmallVector<const TreeEntry *> Entries;
  Type *ScalarTy = GatheredScalars.front()->getType();
  if (!all_of(GatheredScalars, UndefValue::classof)) {
    // Check for gathered extracts.
    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
    SmallVector<Value *> IgnoredVals;
    if (UserIgnoreList)
      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
    bool Resized = false;
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
          Resized = true;
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(ScalarTy));
        }
    // Gather extracts only after we check for fully matched gathers.
    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
    }
    if (GatherShuffle) {
      if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission; it will be emitted after the end of the
        // process to keep the correct order.
        return Delayed;
      }
      assert((Entries.size() == 1 || Entries.size() == 2) &&
             "Expected shuffle of 1 or 2 entries.");
      if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
          Entries.front()->isSame(E->Scalars)) {
        // Perfect match in the graph; we will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(
            dbgs()
            << "SLP: perfect diamond match for gather bundle that starts with "
            << *E->Scalars.front() << ".\n");
        // Restore the mask for previously partially matched values.
        if (Entries.front()->ReorderIndices.empty() &&
            ((Entries.front()->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == Entries.front()->Scalars.size()) ||
             (E->Scalars.size() ==
              Entries.front()->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = Entries.front()->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      if (!Resized) {
        unsigned VF1 = Entries.front()->getVectorFactor();
        unsigned VF2 = Entries.back()->getVectorFactor();
        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(ScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(ScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(ScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore the single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(ScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit the broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for_each(UndefPos, [&](int I) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef with poison; in the mask it is already replaced
          // by the non-poisoned scalar.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(ScalarTy);
        });
      } else {
        // Replace undefs with poisons, emit the broadcast and then emit the
        // freeze.
        for_each(UndefPos, [&](int I) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(ScalarTy);
        });
        NeedFreeze = true;
      }
    }
  };
  if (ExtractShuffle || GatherShuffle) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = false;
    Value *Vec1 = nullptr;
    if (ExtractShuffle) {
      // A gather of extractelements can be represented as just a shuffle of
      // one or two vectors from which the scalars are extracted.
      // Find the input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (ExtractMask[I] == PoisonMaskElem ||
            (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
          ExtractMask[I] = PoisonMaskElem;
          continue;
        }
        if (isa<UndefValue>(E->Scalars[I]))
          continue;
        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
        if (!Vec1) {
          Vec1 = EI->getVectorOperand();
        } else if (Vec1 != EI->getVectorOperand()) {
          assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
                 "Expected only 1 or 2 vectors shuffle.");
          Vec2 = EI->getVectorOperand();
        }
      }
      if (Vec2) {
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        IsUsedInExpr = FindReusedSplat(ExtractMask);
        ShuffleBuilder.add(Vec1, ExtractMask);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
                               ScalarTy, GatheredScalars.size())),
                           ExtractMask);
      }
    }
    if (GatherShuffle) {
      if (Entries.size() == 1) {
        IsUsedInExpr = FindReusedSplat(Mask);
        ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
      } else {
        ShuffleBuilder.add(Entries.front()->VectorizedValue,
                           Entries.back()->VectorizedValue, Mask);
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
            isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than one scalar constant.
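    // As a hypothetical illustration of the split below: for gathered
    // scalars {1.0, %x, 2.0, %y}, GatheredScalars becomes
    // {1.0, poison, 2.0, poison} and NonConstants {poison, %x, poison, %y};
    // the constant vector is built and shuffled first, and the variable
    // scalars are inserted afterwards by the finalize action.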
    bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
    bool IsIdentityShuffle =
        (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
             TTI::SK_PermuteSingleSrc &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
        (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
             TTI::SK_PermuteSingleSrc &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // The NonConstants array contains just the non-constant values;
    // GatheredScalars contains only the constants, used to build the final
    // vector, which is then shuffled.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(ScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(ScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, PoisonValue::classof)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars);
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
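    // E.g., a hypothetical all-constant bundle {3, poison, 5, 7} becomes a
    // single constant vector with mask {0, poison, 2, 3}: only the non-poison
    // lanes get an index.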
    SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(E->Scalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(E->Scalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}

Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
                                                                *this);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilder<>::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  if (E->State == TreeEntry::NeedToGather) {
    // Set the insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E);
    E->VectorizedValue = Vec;
    return Vec;
  }

  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
    if (E->getOpcode() == Instruction::Store) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  Type *ScalarTy = VL0->getType();
  if (auto *Store = dyn_cast<StoreInst>(VL0))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
    ScalarTy = IE->getOperand(1)->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;

    // Adjust the insertion point once all PHIs have been generated.
    Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
      ValueList Operands;
      BasicBlock *IBB = PH->getIncomingBlock(i);

      // Stop emission if all incoming values are generated.
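      // (A hypothetical case of duplicate incoming blocks: a switch with two
      // cases branching to the same successor. The value created for the
      // first occurrence of the block is reused below via VisitedBBs.)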
10183 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { 10184 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10185 return V; 10186 } 10187 10188 if (!VisitedBBs.insert(IBB).second) { 10189 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 10190 continue; 10191 } 10192 10193 Builder.SetInsertPoint(IBB->getTerminator()); 10194 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 10195 Value *Vec = vectorizeOperand(E, i); 10196 NewPhi->addIncoming(Vec, IBB); 10197 } 10198 10199 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 10200 "Invalid number of incoming values"); 10201 return V; 10202 } 10203 10204 case Instruction::ExtractElement: { 10205 Value *V = E->getSingleOperand(0); 10206 setInsertPointAfterBundle(E); 10207 V = FinalShuffle(V, E); 10208 E->VectorizedValue = V; 10209 return V; 10210 } 10211 case Instruction::ExtractValue: { 10212 auto *LI = cast<LoadInst>(E->getSingleOperand(0)); 10213 Builder.SetInsertPoint(LI); 10214 auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); 10215 Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); 10216 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); 10217 Value *NewV = propagateMetadata(V, E->Scalars); 10218 NewV = FinalShuffle(NewV, E); 10219 E->VectorizedValue = NewV; 10220 return NewV; 10221 } 10222 case Instruction::InsertElement: { 10223 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); 10224 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); 10225 Value *V = vectorizeOperand(E, 1); 10226 10227 // Create InsertVector shuffle if necessary 10228 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 10229 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 10230 })); 10231 const unsigned NumElts = 10232 cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); 10233 const unsigned NumScalars = E->Scalars.size(); 10234 10235 unsigned Offset = *getInsertIndex(VL0); 10236 assert(Offset < NumElts && "Failed to find vector index offset"); 10237 10238 // Create shuffle to resize vector 10239 SmallVector<int> Mask; 10240 if (!E->ReorderIndices.empty()) { 10241 inversePermutation(E->ReorderIndices, Mask); 10242 Mask.append(NumElts - NumScalars, PoisonMaskElem); 10243 } else { 10244 Mask.assign(NumElts, PoisonMaskElem); 10245 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); 10246 } 10247 // Create InsertVector shuffle if necessary 10248 bool IsIdentity = true; 10249 SmallVector<int> PrevMask(NumElts, PoisonMaskElem); 10250 Mask.swap(PrevMask); 10251 for (unsigned I = 0; I < NumScalars; ++I) { 10252 Value *Scalar = E->Scalars[PrevMask[I]]; 10253 unsigned InsertIdx = *getInsertIndex(Scalar); 10254 IsIdentity &= InsertIdx - Offset == I; 10255 Mask[InsertIdx - Offset] = I; 10256 } 10257 if (!IsIdentity || NumElts != NumScalars) { 10258 V = Builder.CreateShuffleVector(V, Mask); 10259 if (auto *I = dyn_cast<Instruction>(V)) { 10260 GatherShuffleExtractSeq.insert(I); 10261 CSEBlocks.insert(I->getParent()); 10262 } 10263 } 10264 10265 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 10266 for (unsigned I = 0; I < NumElts; I++) { 10267 if (Mask[I] != PoisonMaskElem) 10268 InsertMask[Offset + I] = I; 10269 } 10270 SmallBitVector UseMask = 10271 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); 10272 SmallBitVector IsFirstUndef = 10273 isUndefVector(FirstInsert->getOperand(0), UseMask); 10274 if ((!IsIdentity || Offset != 0 || 
!IsFirstUndef.all()) && 10275 NumElts != NumScalars) { 10276 if (IsFirstUndef.all()) { 10277 if (!ShuffleVectorInst::isIdentityMask(InsertMask)) { 10278 SmallBitVector IsFirstPoison = 10279 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 10280 if (!IsFirstPoison.all()) { 10281 for (unsigned I = 0; I < NumElts; I++) { 10282 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) 10283 InsertMask[I] = I + NumElts; 10284 } 10285 } 10286 V = Builder.CreateShuffleVector( 10287 V, 10288 IsFirstPoison.all() ? PoisonValue::get(V->getType()) 10289 : FirstInsert->getOperand(0), 10290 InsertMask, cast<Instruction>(E->Scalars.back())->getName()); 10291 if (auto *I = dyn_cast<Instruction>(V)) { 10292 GatherShuffleExtractSeq.insert(I); 10293 CSEBlocks.insert(I->getParent()); 10294 } 10295 } 10296 } else { 10297 SmallBitVector IsFirstPoison = 10298 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 10299 for (unsigned I = 0; I < NumElts; I++) { 10300 if (InsertMask[I] == PoisonMaskElem) 10301 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I; 10302 else 10303 InsertMask[I] += NumElts; 10304 } 10305 V = Builder.CreateShuffleVector( 10306 FirstInsert->getOperand(0), V, InsertMask, 10307 cast<Instruction>(E->Scalars.back())->getName()); 10308 if (auto *I = dyn_cast<Instruction>(V)) { 10309 GatherShuffleExtractSeq.insert(I); 10310 CSEBlocks.insert(I->getParent()); 10311 } 10312 } 10313 } 10314 10315 ++NumVectorInstructions; 10316 E->VectorizedValue = V; 10317 return V; 10318 } 10319 case Instruction::ZExt: 10320 case Instruction::SExt: 10321 case Instruction::FPToUI: 10322 case Instruction::FPToSI: 10323 case Instruction::FPExt: 10324 case Instruction::PtrToInt: 10325 case Instruction::IntToPtr: 10326 case Instruction::SIToFP: 10327 case Instruction::UIToFP: 10328 case Instruction::Trunc: 10329 case Instruction::FPTrunc: 10330 case Instruction::BitCast: { 10331 setInsertPointAfterBundle(E); 10332 10333 Value *InVec = vectorizeOperand(E, 0); 10334 if (E->VectorizedValue) { 10335 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10336 return E->VectorizedValue; 10337 } 10338 10339 auto *CI = cast<CastInst>(VL0); 10340 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 10341 V = FinalShuffle(V, E); 10342 10343 E->VectorizedValue = V; 10344 ++NumVectorInstructions; 10345 return V; 10346 } 10347 case Instruction::FCmp: 10348 case Instruction::ICmp: { 10349 setInsertPointAfterBundle(E); 10350 10351 Value *L = vectorizeOperand(E, 0); 10352 if (E->VectorizedValue) { 10353 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10354 return E->VectorizedValue; 10355 } 10356 Value *R = vectorizeOperand(E, 1); 10357 if (E->VectorizedValue) { 10358 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10359 return E->VectorizedValue; 10360 } 10361 10362 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 10363 Value *V = Builder.CreateCmp(P0, L, R); 10364 propagateIRFlags(V, E->Scalars, VL0); 10365 V = FinalShuffle(V, E); 10366 10367 E->VectorizedValue = V; 10368 ++NumVectorInstructions; 10369 return V; 10370 } 10371 case Instruction::Select: { 10372 setInsertPointAfterBundle(E); 10373 10374 Value *Cond = vectorizeOperand(E, 0); 10375 if (E->VectorizedValue) { 10376 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10377 return E->VectorizedValue; 10378 } 10379 Value *True = vectorizeOperand(E, 1); 10380 if (E->VectorizedValue) { 10381 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10382 
return E->VectorizedValue; 10383 } 10384 Value *False = vectorizeOperand(E, 2); 10385 if (E->VectorizedValue) { 10386 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10387 return E->VectorizedValue; 10388 } 10389 10390 Value *V = Builder.CreateSelect(Cond, True, False); 10391 V = FinalShuffle(V, E); 10392 10393 E->VectorizedValue = V; 10394 ++NumVectorInstructions; 10395 return V; 10396 } 10397 case Instruction::FNeg: { 10398 setInsertPointAfterBundle(E); 10399 10400 Value *Op = vectorizeOperand(E, 0); 10401 10402 if (E->VectorizedValue) { 10403 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10404 return E->VectorizedValue; 10405 } 10406 10407 Value *V = Builder.CreateUnOp( 10408 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); 10409 propagateIRFlags(V, E->Scalars, VL0); 10410 if (auto *I = dyn_cast<Instruction>(V)) 10411 V = propagateMetadata(I, E->Scalars); 10412 10413 V = FinalShuffle(V, E); 10414 10415 E->VectorizedValue = V; 10416 ++NumVectorInstructions; 10417 10418 return V; 10419 } 10420 case Instruction::Add: 10421 case Instruction::FAdd: 10422 case Instruction::Sub: 10423 case Instruction::FSub: 10424 case Instruction::Mul: 10425 case Instruction::FMul: 10426 case Instruction::UDiv: 10427 case Instruction::SDiv: 10428 case Instruction::FDiv: 10429 case Instruction::URem: 10430 case Instruction::SRem: 10431 case Instruction::FRem: 10432 case Instruction::Shl: 10433 case Instruction::LShr: 10434 case Instruction::AShr: 10435 case Instruction::And: 10436 case Instruction::Or: 10437 case Instruction::Xor: { 10438 setInsertPointAfterBundle(E); 10439 10440 Value *LHS = vectorizeOperand(E, 0); 10441 if (E->VectorizedValue) { 10442 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10443 return E->VectorizedValue; 10444 } 10445 Value *RHS = vectorizeOperand(E, 1); 10446 if (E->VectorizedValue) { 10447 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10448 return E->VectorizedValue; 10449 } 10450 10451 Value *V = Builder.CreateBinOp( 10452 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, 10453 RHS); 10454 propagateIRFlags(V, E->Scalars, VL0); 10455 if (auto *I = dyn_cast<Instruction>(V)) 10456 V = propagateMetadata(I, E->Scalars); 10457 10458 V = FinalShuffle(V, E); 10459 10460 E->VectorizedValue = V; 10461 ++NumVectorInstructions; 10462 10463 return V; 10464 } 10465 case Instruction::Load: { 10466 // Loads are inserted at the head of the tree because we don't want to 10467 // sink them all the way down past store instructions. 10468 setInsertPointAfterBundle(E); 10469 10470 LoadInst *LI = cast<LoadInst>(VL0); 10471 Instruction *NewLI; 10472 Value *PO = LI->getPointerOperand(); 10473 if (E->State == TreeEntry::Vectorize) { 10474 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); 10475 10476 // The pointer operand uses an in-tree scalar so we add the new 10477 // LoadInst to ExternalUses list to make sure that an extract will 10478 // be generated in the future. 10479 if (TreeEntry *Entry = getTreeEntry(PO)) { 10480 // Find which lane we need to extract. 
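        // For instance, if the pointer operand is the third scalar of its
        // bundle, lane 2 of that bundle's vector value will later be
        // extracted to feed this vector load.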
10481 unsigned FoundLane = Entry->findLaneForValue(PO); 10482 ExternalUses.emplace_back(PO, NewLI, FoundLane); 10483 } 10484 } else { 10485 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); 10486 Value *VecPtr = vectorizeOperand(E, 0); 10487 if (E->VectorizedValue) { 10488 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10489 return E->VectorizedValue; 10490 } 10491 // Use the minimum alignment of the gathered loads. 10492 Align CommonAlignment = LI->getAlign(); 10493 for (Value *V : E->Scalars) 10494 CommonAlignment = 10495 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); 10496 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); 10497 } 10498 Value *V = propagateMetadata(NewLI, E->Scalars); 10499 10500 V = FinalShuffle(V, E); 10501 E->VectorizedValue = V; 10502 ++NumVectorInstructions; 10503 return V; 10504 } 10505 case Instruction::Store: { 10506 auto *SI = cast<StoreInst>(VL0); 10507 unsigned AS = SI->getPointerAddressSpace(); 10508 10509 setInsertPointAfterBundle(E); 10510 10511 Value *VecValue = vectorizeOperand(E, 0); 10512 VecValue = FinalShuffle(VecValue, E); 10513 10514 Value *ScalarPtr = SI->getPointerOperand(); 10515 Value *VecPtr = Builder.CreateBitCast( 10516 ScalarPtr, VecValue->getType()->getPointerTo(AS)); 10517 StoreInst *ST = 10518 Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); 10519 10520 // The pointer operand uses an in-tree scalar, so add the new BitCast or 10521 // StoreInst to ExternalUses to make sure that an extract will be 10522 // generated in the future. 10523 if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { 10524 // Find which lane we need to extract. 10525 unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); 10526 ExternalUses.push_back(ExternalUser( 10527 ScalarPtr, ScalarPtr != VecPtr ? 
cast<User>(VecPtr) : ST, 10528 FoundLane)); 10529 } 10530 10531 Value *V = propagateMetadata(ST, E->Scalars); 10532 10533 E->VectorizedValue = V; 10534 ++NumVectorInstructions; 10535 return V; 10536 } 10537 case Instruction::GetElementPtr: { 10538 auto *GEP0 = cast<GetElementPtrInst>(VL0); 10539 setInsertPointAfterBundle(E); 10540 10541 Value *Op0 = vectorizeOperand(E, 0); 10542 if (E->VectorizedValue) { 10543 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10544 return E->VectorizedValue; 10545 } 10546 10547 SmallVector<Value *> OpVecs; 10548 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { 10549 Value *OpVec = vectorizeOperand(E, J); 10550 if (E->VectorizedValue) { 10551 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10552 return E->VectorizedValue; 10553 } 10554 OpVecs.push_back(OpVec); 10555 } 10556 10557 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); 10558 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) { 10559 SmallVector<Value *> GEPs; 10560 for (Value *V : E->Scalars) { 10561 if (isa<GetElementPtrInst>(V)) 10562 GEPs.push_back(V); 10563 } 10564 V = propagateMetadata(I, GEPs); 10565 } 10566 10567 V = FinalShuffle(V, E); 10568 10569 E->VectorizedValue = V; 10570 ++NumVectorInstructions; 10571 10572 return V; 10573 } 10574 case Instruction::Call: { 10575 CallInst *CI = cast<CallInst>(VL0); 10576 setInsertPointAfterBundle(E); 10577 10578 Intrinsic::ID IID = Intrinsic::not_intrinsic; 10579 if (Function *FI = CI->getCalledFunction()) 10580 IID = FI->getIntrinsicID(); 10581 10582 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 10583 10584 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); 10585 bool UseIntrinsic = ID != Intrinsic::not_intrinsic && 10586 VecCallCosts.first <= VecCallCosts.second; 10587 10588 Value *ScalarArg = nullptr; 10589 std::vector<Value *> OpVecs; 10590 SmallVector<Type *, 2> TysForDecl; 10591 // Add return type if intrinsic is overloaded on it. 10592 if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) 10593 TysForDecl.push_back( 10594 FixedVectorType::get(CI->getType(), E->Scalars.size())); 10595 for (int j = 0, e = CI->arg_size(); j < e; ++j) { 10596 ValueList OpVL; 10597 // Some intrinsics have scalar arguments. This argument should not be 10598 // vectorized. 
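      // A typical case is the i32 exponent of llvm.powi.*: it stays scalar
      // while the base operand is widened to a vector.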
10599 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { 10600 CallInst *CEI = cast<CallInst>(VL0); 10601 ScalarArg = CEI->getArgOperand(j); 10602 OpVecs.push_back(CEI->getArgOperand(j)); 10603 if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) 10604 TysForDecl.push_back(ScalarArg->getType()); 10605 continue; 10606 } 10607 10608 Value *OpVec = vectorizeOperand(E, j); 10609 if (E->VectorizedValue) { 10610 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10611 return E->VectorizedValue; 10612 } 10613 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 10614 OpVecs.push_back(OpVec); 10615 if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) 10616 TysForDecl.push_back(OpVec->getType()); 10617 } 10618 10619 Function *CF; 10620 if (!UseIntrinsic) { 10621 VFShape Shape = 10622 VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( 10623 VecTy->getNumElements())), 10624 false /*HasGlobalPred*/); 10625 CF = VFDatabase(*CI).getVectorizedFunction(Shape); 10626 } else { 10627 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); 10628 } 10629 10630 SmallVector<OperandBundleDef, 1> OpBundles; 10631 CI->getOperandBundlesAsDefs(OpBundles); 10632 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); 10633 10634 // The scalar argument uses an in-tree scalar so we add the new vectorized 10635 // call to ExternalUses list to make sure that an extract will be 10636 // generated in the future. 10637 if (ScalarArg) { 10638 if (TreeEntry *Entry = getTreeEntry(ScalarArg)) { 10639 // Find which lane we need to extract. 10640 unsigned FoundLane = Entry->findLaneForValue(ScalarArg); 10641 ExternalUses.push_back( 10642 ExternalUser(ScalarArg, cast<User>(V), FoundLane)); 10643 } 10644 } 10645 10646 propagateIRFlags(V, E->Scalars, VL0); 10647 V = FinalShuffle(V, E); 10648 10649 E->VectorizedValue = V; 10650 ++NumVectorInstructions; 10651 return V; 10652 } 10653 case Instruction::ShuffleVector: { 10654 assert(E->isAltShuffle() && 10655 ((Instruction::isBinaryOp(E->getOpcode()) && 10656 Instruction::isBinaryOp(E->getAltOpcode())) || 10657 (Instruction::isCast(E->getOpcode()) && 10658 Instruction::isCast(E->getAltOpcode())) || 10659 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 10660 "Invalid Shuffle Vector Operand"); 10661 10662 Value *LHS = nullptr, *RHS = nullptr; 10663 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { 10664 setInsertPointAfterBundle(E); 10665 LHS = vectorizeOperand(E, 0); 10666 if (E->VectorizedValue) { 10667 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10668 return E->VectorizedValue; 10669 } 10670 RHS = vectorizeOperand(E, 1); 10671 } else { 10672 setInsertPointAfterBundle(E); 10673 LHS = vectorizeOperand(E, 0); 10674 } 10675 if (E->VectorizedValue) { 10676 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 10677 return E->VectorizedValue; 10678 } 10679 10680 Value *V0, *V1; 10681 if (Instruction::isBinaryOp(E->getOpcode())) { 10682 V0 = Builder.CreateBinOp( 10683 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); 10684 V1 = Builder.CreateBinOp( 10685 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); 10686 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 10687 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); 10688 auto *AltCI = cast<CmpInst>(E->getAltOp()); 10689 CmpInst::Predicate AltPred = AltCI->getPredicate(); 10690 V1 = Builder.CreateCmp(AltPred, LHS, RHS); 10691 } else { 10692 V0 = Builder.CreateCast( 10693 
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); 10694 V1 = Builder.CreateCast( 10695 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); 10696 } 10697 // Add V0 and V1 to later analysis to try to find and remove matching 10698 // instruction, if any. 10699 for (Value *V : {V0, V1}) { 10700 if (auto *I = dyn_cast<Instruction>(V)) { 10701 GatherShuffleExtractSeq.insert(I); 10702 CSEBlocks.insert(I->getParent()); 10703 } 10704 } 10705 10706 // Create shuffle to take alternate operations from the vector. 10707 // Also, gather up main and alt scalar ops to propagate IR flags to 10708 // each vector operation. 10709 ValueList OpScalars, AltScalars; 10710 SmallVector<int> Mask; 10711 buildShuffleEntryMask( 10712 E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, 10713 [E, this](Instruction *I) { 10714 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 10715 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), 10716 *TLI); 10717 }, 10718 Mask, &OpScalars, &AltScalars); 10719 10720 propagateIRFlags(V0, OpScalars); 10721 propagateIRFlags(V1, AltScalars); 10722 10723 Value *V = Builder.CreateShuffleVector(V0, V1, Mask); 10724 if (auto *I = dyn_cast<Instruction>(V)) { 10725 V = propagateMetadata(I, E->Scalars); 10726 GatherShuffleExtractSeq.insert(I); 10727 CSEBlocks.insert(I->getParent()); 10728 } 10729 10730 E->VectorizedValue = V; 10731 ++NumVectorInstructions; 10732 10733 return V; 10734 } 10735 default: 10736 llvm_unreachable("unknown inst"); 10737 } 10738 return nullptr; 10739 } 10740 10741 Value *BoUpSLP::vectorizeTree() { 10742 ExtraValueToDebugLocsMap ExternallyUsedValues; 10743 SmallVector<std::pair<Value *, Value *>> ReplacedExternals; 10744 return vectorizeTree(ExternallyUsedValues, ReplacedExternals); 10745 } 10746 10747 namespace { 10748 /// Data type for handling buildvector sequences with the reused scalars from 10749 /// other tree entries. 10750 struct ShuffledInsertData { 10751 /// List of insertelements to be replaced by shuffles. 10752 SmallVector<InsertElementInst *> InsertElements; 10753 /// The parent vectors and shuffle mask for the given list of inserts. 10754 MapVector<Value *, SmallVector<int>> ValueMasks; 10755 }; 10756 } // namespace 10757 10758 Value *BoUpSLP::vectorizeTree( 10759 const ExtraValueToDebugLocsMap &ExternallyUsedValues, 10760 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals, 10761 Instruction *ReductionRoot) { 10762 // All blocks must be scheduled before any instructions are inserted. 10763 for (auto &BSIter : BlocksSchedules) { 10764 scheduleBlock(BSIter.second.get()); 10765 } 10766 // Clean Entry-to-LastInstruction table. It can be affected after scheduling, 10767 // need to rebuild it. 10768 EntryToLastInstruction.clear(); 10769 10770 Builder.SetInsertPoint(ReductionRoot ? ReductionRoot 10771 : &F->getEntryBlock().front()); 10772 auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); 10773 // Run through the list of postponed gathers and emit them, replacing the temp 10774 // emitted allocas with actual vector instructions. 
10775 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); 10776 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues; 10777 for (const TreeEntry *E : PostponedNodes) { 10778 auto *TE = const_cast<TreeEntry *>(E); 10779 if (auto *VecTE = getTreeEntry(TE->Scalars.front())) 10780 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand( 10781 TE->UserTreeIndices.front().EdgeIdx))) 10782 // Found gather node which is absolutely the same as one of the 10783 // vectorized nodes. It may happen after reordering. 10784 continue; 10785 auto *PrevVec = cast<Instruction>(TE->VectorizedValue); 10786 TE->VectorizedValue = nullptr; 10787 auto *UserI = 10788 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue); 10789 Builder.SetInsertPoint(PrevVec); 10790 Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); 10791 Value *Vec = vectorizeTree(TE); 10792 PrevVec->replaceAllUsesWith(Vec); 10793 PostponedValues.try_emplace(Vec).first->second.push_back(TE); 10794 // Replace the stub vector node, if it was used before for one of the 10795 // buildvector nodes already. 10796 auto It = PostponedValues.find(PrevVec); 10797 if (It != PostponedValues.end()) { 10798 for (TreeEntry *VTE : It->getSecond()) 10799 VTE->VectorizedValue = Vec; 10800 } 10801 eraseInstruction(PrevVec); 10802 } 10803 10804 // If the vectorized tree can be rewritten in a smaller type, we truncate the 10805 // vectorized root. InstCombine will then rewrite the entire expression. We 10806 // sign extend the extracted values below. 10807 auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; 10808 if (MinBWs.count(ScalarRoot)) { 10809 if (auto *I = dyn_cast<Instruction>(VectorRoot)) { 10810 // If current instr is a phi and not the last phi, insert it after the 10811 // last phi node. 10812 if (isa<PHINode>(I)) 10813 Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); 10814 else 10815 Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); 10816 } 10817 auto BundleWidth = VectorizableTree[0]->Scalars.size(); 10818 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); 10819 auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); 10820 auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); 10821 VectorizableTree[0]->VectorizedValue = Trunc; 10822 } 10823 10824 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() 10825 << " values .\n"); 10826 10827 SmallVector<ShuffledInsertData> ShuffledInserts; 10828 // Maps vector instruction to original insertelement instruction 10829 DenseMap<Value *, InsertElementInst *> VectorToInsertElement; 10830 // Maps extract Scalar to the corresponding extractelement instruction in the 10831 // basic block. Only one extractelement per block should be emitted. 10832 DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs; 10833 // Extract all of the elements with the external uses. 10834 for (const auto &ExternalUse : ExternalUses) { 10835 Value *Scalar = ExternalUse.Scalar; 10836 llvm::User *User = ExternalUse.User; 10837 10838 // Skip users that we already RAUW. This happens when one instruction 10839 // has multiple uses of the same value. 10840 if (User && !is_contained(Scalar->users(), User)) 10841 continue; 10842 TreeEntry *E = getTreeEntry(Scalar); 10843 assert(E && "Invalid scalar"); 10844 assert(E->State != TreeEntry::NeedToGather && 10845 "Extracting from a gather list"); 10846 // Non-instruction pointers are not deleted, just skip them. 
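    // (A GEP bundle may contain scalars that are plain pointer arguments or
    // constant expressions rather than getelementptr instructions; they are
    // kept as-is, so no extract is needed for their users.)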
10847 if (E->getOpcode() == Instruction::GetElementPtr && 10848 !isa<GetElementPtrInst>(Scalar)) 10849 continue; 10850 10851 Value *Vec = E->VectorizedValue; 10852 assert(Vec && "Can't find vectorizable value"); 10853 10854 Value *Lane = Builder.getInt32(ExternalUse.Lane); 10855 auto ExtractAndExtendIfNeeded = [&](Value *Vec) { 10856 if (Scalar->getType() != Vec->getType()) { 10857 Value *Ex = nullptr; 10858 auto It = ScalarToEEs.find(Scalar); 10859 if (It != ScalarToEEs.end()) { 10860 // No need to emit many extracts, just move the only one in the 10861 // current block. 10862 auto EEIt = It->second.find(Builder.GetInsertBlock()); 10863 if (EEIt != It->second.end()) { 10864 Instruction *I = EEIt->second; 10865 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && 10866 Builder.GetInsertPoint()->comesBefore(I)) 10867 I->moveBefore(&*Builder.GetInsertPoint()); 10868 Ex = I; 10869 } 10870 } 10871 if (!Ex) { 10872 // "Reuse" the existing extract to improve final codegen. 10873 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { 10874 Ex = Builder.CreateExtractElement(ES->getOperand(0), 10875 ES->getOperand(1)); 10876 } else { 10877 Ex = Builder.CreateExtractElement(Vec, Lane); 10878 } 10879 if (auto *I = dyn_cast<Instruction>(Ex)) 10880 ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I); 10881 } 10882 // The then branch of the previous if may produce constants, since 0 10883 // operand might be a constant. 10884 if (auto *ExI = dyn_cast<Instruction>(Ex)) { 10885 GatherShuffleExtractSeq.insert(ExI); 10886 CSEBlocks.insert(ExI->getParent()); 10887 } 10888 // If necessary, sign-extend or zero-extend ScalarRoot 10889 // to the larger type. 10890 if (!MinBWs.count(ScalarRoot)) 10891 return Ex; 10892 if (MinBWs[ScalarRoot].second) 10893 return Builder.CreateSExt(Ex, Scalar->getType()); 10894 return Builder.CreateZExt(Ex, Scalar->getType()); 10895 } 10896 assert(isa<FixedVectorType>(Scalar->getType()) && 10897 isa<InsertElementInst>(Scalar) && 10898 "In-tree scalar of vector type is not insertelement?"); 10899 auto *IE = cast<InsertElementInst>(Scalar); 10900 VectorToInsertElement.try_emplace(Vec, IE); 10901 return Vec; 10902 }; 10903 // If User == nullptr, the Scalar is used as extra arg. Generate 10904 // ExtractElement instruction and update the record for this scalar in 10905 // ExternallyUsedValues. 10906 if (!User) { 10907 assert(ExternallyUsedValues.count(Scalar) && 10908 "Scalar with nullptr as an external user must be registered in " 10909 "ExternallyUsedValues map"); 10910 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 10911 if (auto *PHI = dyn_cast<PHINode>(VecI)) 10912 Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI()); 10913 else 10914 Builder.SetInsertPoint(VecI->getParent(), 10915 std::next(VecI->getIterator())); 10916 } else { 10917 Builder.SetInsertPoint(&F->getEntryBlock().front()); 10918 } 10919 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 10920 // Required to update internally referenced instructions. 10921 Scalar->replaceAllUsesWith(NewInst); 10922 ReplacedExternals.emplace_back(Scalar, NewInst); 10923 continue; 10924 } 10925 10926 if (auto *VU = dyn_cast<InsertElementInst>(User)) { 10927 // Skip if the scalar is another vector op or Vec is not an instruction. 10928 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { 10929 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { 10930 std::optional<unsigned> InsertIdx = getInsertIndex(VU); 10931 if (InsertIdx) { 10932 // Need to use original vector, if the root is truncated. 
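            // (If MinBWs demoted the root, its VectorizedValue now points at
            // the trunc created above; the insertelement chain still needs
            // the full-width VectorRoot.)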
10933 if (MinBWs.count(Scalar) && 10934 VectorizableTree[0]->VectorizedValue == Vec) 10935 Vec = VectorRoot; 10936 auto *It = 10937 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { 10938 // Checks if 2 insertelements are from the same buildvector. 10939 InsertElementInst *VecInsert = Data.InsertElements.front(); 10940 return areTwoInsertFromSameBuildVector( 10941 VU, VecInsert, 10942 [](InsertElementInst *II) { return II->getOperand(0); }); 10943 }); 10944 unsigned Idx = *InsertIdx; 10945 if (It == ShuffledInserts.end()) { 10946 (void)ShuffledInserts.emplace_back(); 10947 It = std::next(ShuffledInserts.begin(), 10948 ShuffledInserts.size() - 1); 10949 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; 10950 if (Mask.empty()) 10951 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 10952 // Find the insertvector, vectorized in tree, if any. 10953 Value *Base = VU; 10954 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { 10955 if (IEBase != User && 10956 (!IEBase->hasOneUse() || 10957 getInsertIndex(IEBase).value_or(Idx) == Idx)) 10958 break; 10959 // Build the mask for the vectorized insertelement instructions. 10960 if (const TreeEntry *E = getTreeEntry(IEBase)) { 10961 do { 10962 IEBase = cast<InsertElementInst>(Base); 10963 int IEIdx = *getInsertIndex(IEBase); 10964 assert(Mask[Idx] == PoisonMaskElem && 10965 "InsertElementInstruction used already."); 10966 Mask[IEIdx] = IEIdx; 10967 Base = IEBase->getOperand(0); 10968 } while (E == getTreeEntry(Base)); 10969 break; 10970 } 10971 Base = cast<InsertElementInst>(Base)->getOperand(0); 10972 // After the vectorization the def-use chain has changed, need 10973 // to look through original insertelement instructions, if they 10974 // get replaced by vector instructions. 10975 auto It = VectorToInsertElement.find(Base); 10976 if (It != VectorToInsertElement.end()) 10977 Base = It->second; 10978 } 10979 } 10980 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; 10981 if (Mask.empty()) 10982 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 10983 Mask[Idx] = ExternalUse.Lane; 10984 It->InsertElements.push_back(cast<InsertElementInst>(User)); 10985 continue; 10986 } 10987 } 10988 } 10989 } 10990 10991 // Generate extracts for out-of-tree users. 10992 // Find the insertion point for the extractelement lane. 
10993 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 10994 if (PHINode *PH = dyn_cast<PHINode>(User)) { 10995 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 10996 if (PH->getIncomingValue(i) == Scalar) { 10997 Instruction *IncomingTerminator = 10998 PH->getIncomingBlock(i)->getTerminator(); 10999 if (isa<CatchSwitchInst>(IncomingTerminator)) { 11000 Builder.SetInsertPoint(VecI->getParent(), 11001 std::next(VecI->getIterator())); 11002 } else { 11003 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 11004 } 11005 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 11006 PH->setOperand(i, NewInst); 11007 } 11008 } 11009 } else { 11010 Builder.SetInsertPoint(cast<Instruction>(User)); 11011 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 11012 User->replaceUsesOfWith(Scalar, NewInst); 11013 } 11014 } else { 11015 Builder.SetInsertPoint(&F->getEntryBlock().front()); 11016 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 11017 User->replaceUsesOfWith(Scalar, NewInst); 11018 } 11019 11020 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 11021 } 11022 11023 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { 11024 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 11025 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 11026 int VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 11027 for (int I = 0, E = Mask.size(); I < E; ++I) { 11028 if (Mask[I] < VF) 11029 CombinedMask1[I] = Mask[I]; 11030 else 11031 CombinedMask2[I] = Mask[I] - VF; 11032 } 11033 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); 11034 ShuffleBuilder.add(V1, CombinedMask1); 11035 if (V2) 11036 ShuffleBuilder.add(V2, CombinedMask2); 11037 return ShuffleBuilder.finalize(std::nullopt); 11038 }; 11039 11040 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask, 11041 bool ForSingleMask) { 11042 unsigned VF = Mask.size(); 11043 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); 11044 if (VF != VecVF) { 11045 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { 11046 Vec = CreateShuffle(Vec, nullptr, Mask); 11047 return std::make_pair(Vec, true); 11048 } 11049 if (!ForSingleMask) { 11050 SmallVector<int> ResizeMask(VF, PoisonMaskElem); 11051 for (unsigned I = 0; I < VF; ++I) { 11052 if (Mask[I] != PoisonMaskElem) 11053 ResizeMask[Mask[I]] = Mask[I]; 11054 } 11055 Vec = CreateShuffle(Vec, nullptr, ResizeMask); 11056 } 11057 } 11058 11059 return std::make_pair(Vec, false); 11060 }; 11061 // Perform shuffling of the vectorize tree entries for better handling of 11062 // external extracts. 11063 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { 11064 // Find the first and the last instruction in the list of insertelements. 
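    // After sorting in buildvector order, .front() is the first insert of the
    // chain and .back() is the last one, which every outside user of the
    // final vector sees.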
11065 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); 11066 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); 11067 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); 11068 Builder.SetInsertPoint(LastInsert); 11069 auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); 11070 Value *NewInst = performExtractsShuffleAction<Value>( 11071 MutableArrayRef(Vector.data(), Vector.size()), 11072 FirstInsert->getOperand(0), 11073 [](Value *Vec) { 11074 return cast<VectorType>(Vec->getType()) 11075 ->getElementCount() 11076 .getKnownMinValue(); 11077 }, 11078 ResizeToVF, 11079 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, 11080 ArrayRef<Value *> Vals) { 11081 assert((Vals.size() == 1 || Vals.size() == 2) && 11082 "Expected exactly 1 or 2 input values."); 11083 if (Vals.size() == 1) { 11084 // Do not create shuffle if the mask is a simple identity 11085 // non-resizing mask. 11086 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) 11087 ->getNumElements() || 11088 !ShuffleVectorInst::isIdentityMask(Mask)) 11089 return CreateShuffle(Vals.front(), nullptr, Mask); 11090 return Vals.front(); 11091 } 11092 return CreateShuffle(Vals.front() ? Vals.front() 11093 : FirstInsert->getOperand(0), 11094 Vals.back(), Mask); 11095 }); 11096 auto It = ShuffledInserts[I].InsertElements.rbegin(); 11097 // Rebuild buildvector chain. 11098 InsertElementInst *II = nullptr; 11099 if (It != ShuffledInserts[I].InsertElements.rend()) 11100 II = *It; 11101 SmallVector<Instruction *> Inserts; 11102 while (It != ShuffledInserts[I].InsertElements.rend()) { 11103 assert(II && "Must be an insertelement instruction."); 11104 if (*It == II) 11105 ++It; 11106 else 11107 Inserts.push_back(cast<Instruction>(II)); 11108 II = dyn_cast<InsertElementInst>(II->getOperand(0)); 11109 } 11110 for (Instruction *II : reverse(Inserts)) { 11111 II->replaceUsesOfWith(II->getOperand(0), NewInst); 11112 if (auto *NewI = dyn_cast<Instruction>(NewInst)) 11113 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) 11114 II->moveAfter(NewI); 11115 NewInst = II; 11116 } 11117 LastInsert->replaceAllUsesWith(NewInst); 11118 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { 11119 IE->replaceUsesOfWith(IE->getOperand(0), 11120 PoisonValue::get(IE->getOperand(0)->getType())); 11121 IE->replaceUsesOfWith(IE->getOperand(1), 11122 PoisonValue::get(IE->getOperand(1)->getType())); 11123 eraseInstruction(IE); 11124 } 11125 CSEBlocks.insert(LastInsert->getParent()); 11126 } 11127 11128 SmallVector<Instruction *> RemovedInsts; 11129 // For each vectorized value: 11130 for (auto &TEPtr : VectorizableTree) { 11131 TreeEntry *Entry = TEPtr.get(); 11132 11133 // No need to handle users of gathered values. 11134 if (Entry->State == TreeEntry::NeedToGather) 11135 continue; 11136 11137 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 11138 11139 // For each lane: 11140 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 11141 Value *Scalar = Entry->Scalars[Lane]; 11142 11143 if (Entry->getOpcode() == Instruction::GetElementPtr && 11144 !isa<GetElementPtrInst>(Scalar)) 11145 continue; 11146 #ifndef NDEBUG 11147 Type *Ty = Scalar->getType(); 11148 if (!Ty->isVoidTy()) { 11149 for (User *U : Scalar->users()) { 11150 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 11151 11152 // It is legal to delete users in the ignorelist. 
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
      // Retain to-be-deleted instructions for some debug-info
      // bookkeeping. NOTE: eraseInstruction only marks the instruction for
      // deletion - instructions are not deleted until later.
      RemovedInsts.push_back(cast<Instruction>(Scalar));
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  return VectorizableTree[0]->VectorizedValue;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle
  // %0, poison, <0, 0, 0, 0>.
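  // When merging the two masks below, undef lanes take the value from the
  // other mask, e.g. <0, undef, 2, undef> and <0, 1, undef, undef> combine
  // into <0, 1, 2, undef>.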
11235 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, 11236 SmallVectorImpl<int> &NewMask) { 11237 if (I1->getType() != I2->getType()) 11238 return false; 11239 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); 11240 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); 11241 if (!SI1 || !SI2) 11242 return I1->isIdenticalTo(I2); 11243 if (SI1->isIdenticalTo(SI2)) 11244 return true; 11245 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) 11246 if (SI1->getOperand(I) != SI2->getOperand(I)) 11247 return false; 11248 // Check if the second instruction is more defined than the first one. 11249 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); 11250 ArrayRef<int> SM1 = SI1->getShuffleMask(); 11251 // Count trailing undefs in the mask to check the final number of used 11252 // registers. 11253 unsigned LastUndefsCnt = 0; 11254 for (int I = 0, E = NewMask.size(); I < E; ++I) { 11255 if (SM1[I] == PoisonMaskElem) 11256 ++LastUndefsCnt; 11257 else 11258 LastUndefsCnt = 0; 11259 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem && 11260 NewMask[I] != SM1[I]) 11261 return false; 11262 if (NewMask[I] == PoisonMaskElem) 11263 NewMask[I] = SM1[I]; 11264 } 11265 // Check if the last undefs actually change the final number of used vector 11266 // registers. 11267 return SM1.size() - LastUndefsCnt > 1 && 11268 TTI->getNumberOfParts(SI1->getType()) == 11269 TTI->getNumberOfParts( 11270 FixedVectorType::get(SI1->getType()->getElementType(), 11271 SM1.size() - LastUndefsCnt)); 11272 }; 11273 // Perform O(N^2) search over the gather/shuffle sequences and merge identical 11274 // instructions. TODO: We can further optimize this scan if we split the 11275 // instructions into different buckets based on the insert lane. 11276 SmallVector<Instruction *, 16> Visited; 11277 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 11278 assert(*I && 11279 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 11280 "Worklist not sorted properly!"); 11281 BasicBlock *BB = (*I)->getBlock(); 11282 // For all instructions in blocks containing gather sequences: 11283 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 11284 if (isDeleted(&In)) 11285 continue; 11286 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) && 11287 !GatherShuffleExtractSeq.contains(&In)) 11288 continue; 11289 11290 // Check if we can replace this instruction with any of the 11291 // visited instructions. 
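      // The replacement is directional: a copy is only RAUW'ed with an
      // identical-or-more-defined copy placed so that all uses of the removed
      // instruction stay dominated by the surviving one.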
11292 bool Replaced = false; 11293 for (Instruction *&V : Visited) { 11294 SmallVector<int> NewMask; 11295 if (IsIdenticalOrLessDefined(&In, V, NewMask) && 11296 DT->dominates(V->getParent(), In.getParent())) { 11297 In.replaceAllUsesWith(V); 11298 eraseInstruction(&In); 11299 if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) 11300 if (!NewMask.empty()) 11301 SI->setShuffleMask(NewMask); 11302 Replaced = true; 11303 break; 11304 } 11305 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && 11306 GatherShuffleExtractSeq.contains(V) && 11307 IsIdenticalOrLessDefined(V, &In, NewMask) && 11308 DT->dominates(In.getParent(), V->getParent())) { 11309 In.moveAfter(V); 11310 V->replaceAllUsesWith(&In); 11311 eraseInstruction(V); 11312 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) 11313 if (!NewMask.empty()) 11314 SI->setShuffleMask(NewMask); 11315 V = &In; 11316 Replaced = true; 11317 break; 11318 } 11319 } 11320 if (!Replaced) { 11321 assert(!is_contained(Visited, &In)); 11322 Visited.push_back(&In); 11323 } 11324 } 11325 } 11326 CSEBlocks.clear(); 11327 GatherShuffleExtractSeq.clear(); 11328 } 11329 11330 BoUpSLP::ScheduleData * 11331 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { 11332 ScheduleData *Bundle = nullptr; 11333 ScheduleData *PrevInBundle = nullptr; 11334 for (Value *V : VL) { 11335 if (doesNotNeedToBeScheduled(V)) 11336 continue; 11337 ScheduleData *BundleMember = getScheduleData(V); 11338 assert(BundleMember && 11339 "no ScheduleData for bundle member " 11340 "(maybe not in same basic block)"); 11341 assert(BundleMember->isSchedulingEntity() && 11342 "bundle member already part of other bundle"); 11343 if (PrevInBundle) { 11344 PrevInBundle->NextInBundle = BundleMember; 11345 } else { 11346 Bundle = BundleMember; 11347 } 11348 11349 // Group the instructions to a bundle. 11350 BundleMember->FirstInBundle = Bundle; 11351 PrevInBundle = BundleMember; 11352 } 11353 assert(Bundle && "Failed to find schedule bundle"); 11354 return Bundle; 11355 } 11356 11357 // Groups the instructions to a bundle (which is then a single scheduling entity) 11358 // and schedules instructions until the bundle gets ready. 11359 std::optional<BoUpSLP::ScheduleData *> 11360 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 11361 const InstructionsState &S) { 11362 // No need to schedule PHIs, insertelement, extractelement and extractvalue 11363 // instructions. 11364 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || 11365 doesNotNeedToSchedule(VL)) 11366 return nullptr; 11367 11368 // Initialize the instruction bundle. 11369 Instruction *OldScheduleEnd = ScheduleEnd; 11370 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); 11371 11372 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, 11373 ScheduleData *Bundle) { 11374 // The scheduling region got new instructions at the lower end (or it is a 11375 // new region for the first bundle). This makes it necessary to 11376 // recalculate all dependencies. 11377 // It is seldom that this needs to be done a second time after adding the 11378 // initial bundle to the region. 
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or
      // it is a new region for the first bundle), it becomes necessary to
      // recalculate all dependencies. Otherwise the compiler may crash trying
      // to incorrectly calculate dependencies and emit instructions in the
      // wrong order at the actual scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
11442 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 11443 << " was already scheduled\n"); 11444 ReSchedule = true; 11445 } 11446 11447 auto *Bundle = buildBundle(VL); 11448 TryScheduleBundleImpl(ReSchedule, Bundle); 11449 if (!Bundle->isReady()) { 11450 cancelScheduling(VL, S.OpValue); 11451 return std::nullopt; 11452 } 11453 return Bundle; 11454 } 11455 11456 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, 11457 Value *OpValue) { 11458 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || 11459 doesNotNeedToSchedule(VL)) 11460 return; 11461 11462 if (doesNotNeedToBeScheduled(OpValue)) 11463 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); 11464 ScheduleData *Bundle = getScheduleData(OpValue); 11465 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 11466 assert(!Bundle->IsScheduled && 11467 "Can't cancel bundle which is already scheduled"); 11468 assert(Bundle->isSchedulingEntity() && 11469 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && 11470 "tried to unbundle something which is not a bundle"); 11471 11472 // Remove the bundle from the ready list. 11473 if (Bundle->isReady()) 11474 ReadyInsts.remove(Bundle); 11475 11476 // Un-bundle: make single instructions out of the bundle. 11477 ScheduleData *BundleMember = Bundle; 11478 while (BundleMember) { 11479 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 11480 BundleMember->FirstInBundle = BundleMember; 11481 ScheduleData *Next = BundleMember->NextInBundle; 11482 BundleMember->NextInBundle = nullptr; 11483 BundleMember->TE = nullptr; 11484 if (BundleMember->unscheduledDepsInBundle() == 0) { 11485 ReadyInsts.insert(BundleMember); 11486 } 11487 BundleMember = Next; 11488 } 11489 } 11490 11491 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { 11492 // Allocate a new ScheduleData for the instruction. 11493 if (ChunkPos >= ChunkSize) { 11494 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); 11495 ChunkPos = 0; 11496 } 11497 return &(ScheduleDataChunks.back()[ChunkPos++]); 11498 } 11499 11500 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, 11501 const InstructionsState &S) { 11502 if (getScheduleData(V, isOneOf(S, V))) 11503 return true; 11504 Instruction *I = dyn_cast<Instruction>(V); 11505 assert(I && "bundle member must be an instruction"); 11506 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && 11507 !doesNotNeedToBeScheduled(I) && 11508 "phi nodes/insertelements/extractelements/extractvalues don't need to " 11509 "be scheduled"); 11510 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { 11511 ScheduleData *ISD = getScheduleData(I); 11512 if (!ISD) 11513 return false; 11514 assert(isInSchedulingRegion(ISD) && 11515 "ScheduleData not in scheduling region"); 11516 ScheduleData *SD = allocateScheduleDataChunks(); 11517 SD->Inst = I; 11518 SD->init(SchedulingRegionID, S.OpValue); 11519 ExtraScheduleDataMap[I][S.OpValue] = SD; 11520 return true; 11521 }; 11522 if (CheckScheduleForI(I)) 11523 return true; 11524 if (!ScheduleStart) { 11525 // It's the first instruction in the new region. 
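    // (The region starts as the half-open range [I, I->getNextNode()) and is
    // grown upwards or downwards below as further bundle members fall
    // outside it.)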
11526 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 11527 ScheduleStart = I; 11528 ScheduleEnd = I->getNextNode(); 11529 if (isOneOf(S, I) != I) 11530 CheckScheduleForI(I); 11531 assert(ScheduleEnd && "tried to vectorize a terminator?"); 11532 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 11533 return true; 11534 } 11535 // Search up and down at the same time, because we don't know if the new 11536 // instruction is above or below the existing scheduling region. 11537 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted 11538 // against the budget. Otherwise debug info could affect codegen. 11539 BasicBlock::reverse_iterator UpIter = 11540 ++ScheduleStart->getIterator().getReverse(); 11541 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 11542 BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); 11543 BasicBlock::iterator LowerEnd = BB->end(); 11544 auto IsAssumeLikeIntr = [](const Instruction &I) { 11545 if (auto *II = dyn_cast<IntrinsicInst>(&I)) 11546 return II->isAssumeLikeIntrinsic(); 11547 return false; 11548 }; 11549 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 11550 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 11551 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && 11552 &*DownIter != I) { 11553 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { 11554 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); 11555 return false; 11556 } 11557 11558 ++UpIter; 11559 ++DownIter; 11560 11561 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 11562 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 11563 } 11564 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { 11565 assert(I->getParent() == ScheduleStart->getParent() && 11566 "Instruction is in wrong basic block."); 11567 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 11568 ScheduleStart = I; 11569 if (isOneOf(S, I) != I) 11570 CheckScheduleForI(I); 11571 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I 11572 << "\n"); 11573 return true; 11574 } 11575 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && 11576 "Expected to reach top of the basic block or instruction down the " 11577 "lower end."); 11578 assert(I->getParent() == ScheduleEnd->getParent() && 11579 "Instruction is in wrong basic block."); 11580 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 11581 nullptr); 11582 ScheduleEnd = I->getNextNode(); 11583 if (isOneOf(S, I) != I) 11584 CheckScheduleForI(I); 11585 assert(ScheduleEnd && "tried to vectorize a terminator?"); 11586 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 11587 return true; 11588 } 11589 11590 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 11591 Instruction *ToI, 11592 ScheduleData *PrevLoadStore, 11593 ScheduleData *NextLoadStore) { 11594 ScheduleData *CurrentLoadStore = PrevLoadStore; 11595 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 11596 // No need to allocate data for non-schedulable instructions. 
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}

void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
                        << "\n");
      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      auto makeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
11694 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
11695 for (Instruction *I = BundleMember->Inst->getNextNode();
11696 I != ScheduleEnd; I = I->getNextNode()) {
11697 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
11698 continue;
11699
11700 // Add the dependency
11701 makeControlDependent(I);
11702
11703 if (!isGuaranteedToTransferExecutionToSuccessor(I))
11704 // Everything past here must be control dependent on I.
11705 break;
11706 }
11707 }
11708
11709 if (RegionHasStackSave) {
11710 // If we have an inalloca alloca instruction, it needs to be scheduled
11711 // after any preceding stacksave. We also need to prevent any alloca
11712 // from reordering above a preceding stackrestore.
11713 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
11714 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
11715 for (Instruction *I = BundleMember->Inst->getNextNode();
11716 I != ScheduleEnd; I = I->getNextNode()) {
11717 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
11718 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
11719 // Any allocas past here must be control dependent on I, and I
11720 // must be memory dependent on BundleMember->Inst.
11721 break;
11722
11723 if (!isa<AllocaInst>(I))
11724 continue;
11725
11726 // Add the dependency
11727 makeControlDependent(I);
11728 }
11729 }
11730
11731 // In addition to the cases handled just above, we need to prevent
11732 // allocas and loads/stores from moving below a stacksave or a
11733 // stackrestore. Avoiding moving allocas below a stackrestore is currently
11734 // thought to be conservative. Moving loads/stores below a stackrestore
11735 // can lead to incorrect code.
11736 if (isa<AllocaInst>(BundleMember->Inst) ||
11737 BundleMember->Inst->mayReadOrWriteMemory()) {
11738 for (Instruction *I = BundleMember->Inst->getNextNode();
11739 I != ScheduleEnd; I = I->getNextNode()) {
11740 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
11741 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
11742 continue;
11743
11744 // Add the dependency
11745 makeControlDependent(I);
11746 break;
11747 }
11748 }
11749 }
11750
11751 // Handle the memory dependencies (if any).
11752 ScheduleData *DepDest = BundleMember->NextLoadStore;
11753 if (!DepDest)
11754 continue;
11755 Instruction *SrcInst = BundleMember->Inst;
11756 assert(SrcInst->mayReadOrWriteMemory() &&
11757 "NextLoadStore list for non-memory-affecting bundle?");
11758 MemoryLocation SrcLoc = getLocation(SrcInst);
11759 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
11760 unsigned numAliased = 0;
11761 unsigned DistToSrc = 1;
11762
11763 for (; DepDest; DepDest = DepDest->NextLoadStore) {
11764 assert(isInSchedulingRegion(DepDest));
11765
11766 // We have two limits to reduce the complexity:
11767 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
11768 // SLP->isAliased (which is the expensive part in this loop).
11769 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
11770 // the whole loop (even if the loop is fast, it's quadratic).
11771 // It's important for the loop break condition (see below) to
11772 // check this limit even between two read-only instructions.
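// For example (an illustrative reading of the condition below): once
// AliasedCheckLimit aliasing pairs have been recorded for this source, any
// later may-write pair is conservatively treated as dependent without another
// isAliased query, and once DistToSrc reaches MaxMemDepDistance every
// remaining pair is treated as dependent regardless of aliasing.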
11773 if (DistToSrc >= MaxMemDepDistance ||
11774 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
11775 (numAliased >= AliasedCheckLimit ||
11776 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
11777
11778 // We increment the counter only if the locations are aliased
11779 // (instead of counting all alias checks). This gives a better
11780 // balance between reduced runtime and accurate dependencies.
11781 numAliased++;
11782
11783 DepDest->MemoryDependencies.push_back(BundleMember);
11784 BundleMember->Dependencies++;
11785 ScheduleData *DestBundle = DepDest->FirstInBundle;
11786 if (!DestBundle->IsScheduled) {
11787 BundleMember->incrementUnscheduledDeps(1);
11788 }
11789 if (!DestBundle->hasValidDependencies()) {
11790 WorkList.push_back(DestBundle);
11791 }
11792 }
11793
11794 // Example explaining the loop break condition: Let's assume our
11795 // starting instruction is i0 and MaxMemDepDistance = 3.
11796 //
11797 // +--------v--v--v
11798 // i0,i1,i2,i3,i4,i5,i6,i7,i8
11799 // +--------^--^--^
11800 //
11801 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
11802 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
11803 // Previously we already added dependencies from i3 to i6,i7,i8
11804 // (because of MaxMemDepDistance). As we added a dependency from
11805 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
11806 // and we can abort this loop at i6.
11807 if (DistToSrc >= 2 * MaxMemDepDistance)
11808 break;
11809 DistToSrc++;
11810 }
11811 }
11812 if (InsertInReadyList && SD->isReady()) {
11813 ReadyInsts.insert(SD);
11814 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
11815 << "\n");
11816 }
11817 }
11818 }
11819
11820 void BoUpSLP::BlockScheduling::resetSchedule() {
11821 assert(ScheduleStart &&
11822 "tried to reset schedule on block which has not been scheduled");
11823 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
11824 doForAllOpcodes(I, [&](ScheduleData *SD) {
11825 assert(isInSchedulingRegion(SD) &&
11826 "ScheduleData not in scheduling region");
11827 SD->IsScheduled = false;
11828 SD->resetUnscheduledDeps();
11829 });
11830 }
11831 ReadyInsts.clear();
11832 }
11833
11834 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
11835 if (!BS->ScheduleStart)
11836 return;
11837
11838 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
11839
11840 // A key point - if we got here, pre-scheduling was able to find a valid
11841 // scheduling of the sub-graph of the scheduling window which consists
11842 // of all vector bundles and their transitive users. As such, we do not
11843 // need to reschedule anything *outside of* that subgraph.
11844
11845 BS->resetSchedule();
11846
11847 // For the real scheduling we use a more sophisticated ready-list: it is
11848 // sorted by the original instruction location. This lets the final schedule
11849 // be as close as possible to the original instruction order.
11850 // WARNING: If changing this order causes a correctness issue, that means
11851 // there is some missing dependence edge in the schedule data graph.
11852 struct ScheduleDataCompare {
11853 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
11854 return SD2->SchedulingPriority < SD1->SchedulingPriority;
11855 }
11856 };
11857 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
11858
11859 // Ensure that all dependency data is updated (for nodes in the sub-graph)
11860 // and fill the ready-list with initial instructions.
11861 int Idx = 0; 11862 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 11863 I = I->getNextNode()) { 11864 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { 11865 TreeEntry *SDTE = getTreeEntry(SD->Inst); 11866 (void)SDTE; 11867 assert((isVectorLikeInstWithConstOps(SD->Inst) || 11868 SD->isPartOfBundle() == 11869 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && 11870 "scheduler and vectorizer bundle mismatch"); 11871 SD->FirstInBundle->SchedulingPriority = Idx++; 11872 11873 if (SD->isSchedulingEntity() && SD->isPartOfBundle()) 11874 BS->calculateDependencies(SD, false, this); 11875 }); 11876 } 11877 BS->initialFillReadyList(ReadyInsts); 11878 11879 Instruction *LastScheduledInst = BS->ScheduleEnd; 11880 11881 // Do the "real" scheduling. 11882 while (!ReadyInsts.empty()) { 11883 ScheduleData *picked = *ReadyInsts.begin(); 11884 ReadyInsts.erase(ReadyInsts.begin()); 11885 11886 // Move the scheduled instruction(s) to their dedicated places, if not 11887 // there yet. 11888 for (ScheduleData *BundleMember = picked; BundleMember; 11889 BundleMember = BundleMember->NextInBundle) { 11890 Instruction *pickedInst = BundleMember->Inst; 11891 if (pickedInst->getNextNode() != LastScheduledInst) 11892 pickedInst->moveBefore(LastScheduledInst); 11893 LastScheduledInst = pickedInst; 11894 } 11895 11896 BS->schedule(picked, ReadyInsts); 11897 } 11898 11899 // Check that we didn't break any of our invariants. 11900 #ifdef EXPENSIVE_CHECKS 11901 BS->verify(); 11902 #endif 11903 11904 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) 11905 // Check that all schedulable entities got scheduled 11906 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { 11907 BS->doForAllOpcodes(I, [&](ScheduleData *SD) { 11908 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { 11909 assert(SD->IsScheduled && "must be scheduled at this point"); 11910 } 11911 }); 11912 } 11913 #endif 11914 11915 // Avoid duplicate scheduling of the block. 11916 BS->ScheduleStart = nullptr; 11917 } 11918 11919 unsigned BoUpSLP::getVectorElementSize(Value *V) { 11920 // If V is a store, just return the width of the stored value (or value 11921 // truncated just before storing) without traversing the expression tree. 11922 // This is the common case. 11923 if (auto *Store = dyn_cast<StoreInst>(V)) 11924 return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); 11925 11926 if (auto *IEI = dyn_cast<InsertElementInst>(V)) 11927 return getVectorElementSize(IEI->getOperand(1)); 11928 11929 auto E = InstrElementSize.find(V); 11930 if (E != InstrElementSize.end()) 11931 return E->second; 11932 11933 // If V is not a store, we can traverse the expression tree to find loads 11934 // that feed it. The type of the loaded value may indicate a more suitable 11935 // width than V's type. We want to base the vector element size on the width 11936 // of memory operations where possible. 11937 SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist; 11938 SmallPtrSet<Instruction *, 16> Visited; 11939 if (auto *I = dyn_cast<Instruction>(V)) { 11940 Worklist.emplace_back(I, I->getParent()); 11941 Visited.insert(I); 11942 } 11943 11944 // Traverse the expression tree in bottom-up order looking for loads. If we 11945 // encounter an instruction we don't yet handle, we give up. 
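// Illustrative example (hypothetical IR): for
//   %x = load i16, ptr %p
//   %y = sext i16 %x to i32
//   %z = add i32 %y, 1
// the walk below reaches the i16 load from %z, so the element size is based
// on the 16-bit memory access rather than on the i32 type of %z.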
11946 auto Width = 0u;
11947 while (!Worklist.empty()) {
11948 Instruction *I;
11949 BasicBlock *Parent;
11950 std::tie(I, Parent) = Worklist.pop_back_val();
11951
11952 // We should only be looking at scalar instructions here. If the current
11953 // instruction has a vector type, skip.
11954 auto *Ty = I->getType();
11955 if (isa<VectorType>(Ty))
11956 continue;
11957
11958 // If the current instruction is a load, update Width to reflect the
11959 // width of the loaded value.
11960 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
11961 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
11962
11963 // Otherwise, we need to visit the operands of the instruction. We only
11964 // handle the interesting cases from buildTree here. If an operand is an
11965 // instruction we haven't yet visited, and either the user is a PHI node or
11966 // the operand is in the same basic block as the user, we add it to the worklist.
11967 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
11968 BinaryOperator, UnaryOperator>(I)) {
11969 for (Use &U : I->operands())
11970 if (auto *J = dyn_cast<Instruction>(U.get()))
11971 if (Visited.insert(J).second &&
11972 (isa<PHINode>(I) || J->getParent() == Parent))
11973 Worklist.emplace_back(J, J->getParent());
11974 } else {
11975 break;
11976 }
11977 }
11978
11979 // If we didn't encounter a memory access in the expression tree, or if we
11980 // gave up for some reason, just return the width of V. Otherwise, return the
11981 // maximum width we found.
11982 if (!Width) {
11983 if (auto *CI = dyn_cast<CmpInst>(V))
11984 V = CI->getOperand(0);
11985 Width = DL->getTypeSizeInBits(V->getType());
11986 }
11987
11988 for (Instruction *I : Visited)
11989 InstrElementSize[I] = Width;
11990
11991 return Width;
11992 }
11993
11994 // Determine if a value V in a vectorizable expression Expr can be demoted to a
11995 // smaller type with a truncation. We collect the values that will be demoted
11996 // in ToDemote and additional roots that require investigating in Roots.
11997 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
11998 SmallVectorImpl<Value *> &ToDemote,
11999 SmallVectorImpl<Value *> &Roots) {
12000 // We can always demote constants.
12001 if (isa<Constant>(V)) {
12002 ToDemote.push_back(V);
12003 return true;
12004 }
12005
12006 // If the value is not an instruction in the expression with only one use, it
12007 // cannot be demoted.
12008 auto *I = dyn_cast<Instruction>(V);
12009 if (!I || !I->hasOneUse() || !Expr.count(I))
12010 return false;
12011
12012 switch (I->getOpcode()) {
12013
12014 // We can always demote truncations and extensions. Since truncations can
12015 // seed additional demotion, we save the truncated value.
12016 case Instruction::Trunc:
12017 Roots.push_back(I->getOperand(0));
12018 break;
12019 case Instruction::ZExt:
12020 case Instruction::SExt:
12021 if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
12022 return false;
12023 break;
12024
12025 // We can demote certain binary operations if we can demote both of their
12026 // operands.
12027 case Instruction::Add:
12028 case Instruction::Sub:
12029 case Instruction::Mul:
12030 case Instruction::And:
12031 case Instruction::Or:
12032 case Instruction::Xor:
12033 if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
12034 !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
12035 return false;
12036 break;
12037
12038 // We can demote selects if we can demote their true and false values.
12039 case Instruction::Select: { 12040 SelectInst *SI = cast<SelectInst>(I); 12041 if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || 12042 !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) 12043 return false; 12044 break; 12045 } 12046 12047 // We can demote phis if we can demote all their incoming operands. Note that 12048 // we don't need to worry about cycles since we ensure single use above. 12049 case Instruction::PHI: { 12050 PHINode *PN = cast<PHINode>(I); 12051 for (Value *IncValue : PN->incoming_values()) 12052 if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) 12053 return false; 12054 break; 12055 } 12056 12057 // Otherwise, conservatively give up. 12058 default: 12059 return false; 12060 } 12061 12062 // Record the value that we can demote. 12063 ToDemote.push_back(V); 12064 return true; 12065 } 12066 12067 void BoUpSLP::computeMinimumValueSizes() { 12068 // If there are no external uses, the expression tree must be rooted by a 12069 // store. We can't demote in-memory values, so there is nothing to do here. 12070 if (ExternalUses.empty()) 12071 return; 12072 12073 // We only attempt to truncate integer expressions. 12074 auto &TreeRoot = VectorizableTree[0]->Scalars; 12075 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); 12076 if (!TreeRootIT) 12077 return; 12078 12079 // If the expression is not rooted by a store, these roots should have 12080 // external uses. We will rely on InstCombine to rewrite the expression in 12081 // the narrower type. However, InstCombine only rewrites single-use values. 12082 // This means that if a tree entry other than a root is used externally, it 12083 // must have multiple uses and InstCombine will not rewrite it. The code 12084 // below ensures that only the roots are used externally. 12085 SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); 12086 for (auto &EU : ExternalUses) 12087 if (!Expr.erase(EU.Scalar)) 12088 return; 12089 if (!Expr.empty()) 12090 return; 12091 12092 // Collect the scalar values of the vectorizable expression. We will use this 12093 // context to determine which values can be demoted. If we see a truncation, 12094 // we mark it as seeding another demotion. 12095 for (auto &EntryPtr : VectorizableTree) 12096 Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); 12097 12098 // Ensure the roots of the vectorizable tree don't form a cycle. They must 12099 // have a single external user that is not in the vectorizable tree. 12100 for (auto *Root : TreeRoot) 12101 if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) 12102 return; 12103 12104 // Conservatively determine if we can actually truncate the roots of the 12105 // expression. Collect the values that can be demoted in ToDemote and 12106 // additional roots that require investigating in Roots. 12107 SmallVector<Value *, 32> ToDemote; 12108 SmallVector<Value *, 4> Roots; 12109 for (auto *Root : TreeRoot) 12110 if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) 12111 return; 12112 12113 // The maximum bit width required to represent all the values that can be 12114 // demoted without loss of precision. It would be safe to truncate the roots 12115 // of the expression to this width. 12116 auto MaxBitWidth = 8u; 12117 12118 // We first check if all the bits of the roots are demanded. If they're not, 12119 // we can truncate the roots to this narrower type. 
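// Illustrative example (hypothetical IR): if the only user of a root is
//   %u = and i32 %root, 255
// then DemandedBits reports just the low 8 bits of %root as live, so
// MaxBitWidth can drop to 8 in the loop below.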
12120 for (auto *Root : TreeRoot) {
12121 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
12122 MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
12123 MaxBitWidth);
12124 }
12125
12126 // True if the roots can be zero-extended back to their original type, rather
12127 // than sign-extended. We know that if the leading bits are not demanded, we
12128 // can safely zero-extend. So we initialize IsKnownPositive to True.
12129 bool IsKnownPositive = true;
12130
12131 // If all the bits of the roots are demanded, we can try a little harder to
12132 // compute a narrower type. This can happen, for example, if the roots are
12133 // getelementptr indices. InstCombine promotes these indices to the pointer
12134 // width. Thus, all their bits are technically demanded even though the
12135 // address computation might be vectorized in a smaller type.
12136 //
12137 // We start by looking at each entry that can be demoted. We compute the
12138 // maximum bit width required to store the scalar by using ValueTracking to
12139 // compute the number of high-order bits we can truncate.
12140 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
12141 llvm::all_of(TreeRoot, [](Value *R) {
12142 assert(R->hasOneUse() && "Root should have only one use!");
12143 return isa<GetElementPtrInst>(R->user_back());
12144 })) {
12145 MaxBitWidth = 8u;
12146
12147 // Determine if the sign bit of all the roots is known to be zero. If not,
12148 // IsKnownPositive is set to False.
12149 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
12150 KnownBits Known = computeKnownBits(R, *DL);
12151 return Known.isNonNegative();
12152 });
12153
12154 // Determine the maximum number of bits required to store the scalar
12155 // values.
12156 for (auto *Scalar : ToDemote) {
12157 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
12158 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
12159 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
12160 }
12161
12162 // If we can't prove that the sign bit is zero, we must add one to the
12163 // maximum bit width to account for the unknown sign bit. This preserves
12164 // the existing sign bit so we can safely sign-extend the root back to the
12165 // original type. Otherwise, if we know the sign bit is zero, we will
12166 // zero-extend the root instead.
12167 //
12168 // FIXME: This is somewhat suboptimal, as there will be cases where adding
12169 // one to the maximum bit width will yield a larger-than-necessary
12170 // type. In general, we need to add an extra bit only if we can't
12171 // prove that the upper bit of the original type is equal to the
12172 // upper bit of the proposed smaller type. If these two bits are the
12173 // same (either zero or one) we know that sign-extending from the
12174 // smaller type will result in the same value. Here, since we can't
12175 // yet prove this, we are just making the proposed smaller type
12176 // larger to ensure correctness.
12177 if (!IsKnownPositive)
12178 ++MaxBitWidth;
12179 }
12180
12181 // Round MaxBitWidth up to the next power-of-two.
12182 MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
12183
12184 // If the maximum bit width we compute is less than the width of the roots'
12185 // type, we can proceed with the narrowing. Otherwise, do nothing.
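// Illustrative example (assumed values): with i32 roots, MaxBitWidth = 5
// rounds up to 8 and the check below allows narrowing to i8; if it rounds
// up to 32 instead, nothing is narrowed.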
12186 if (MaxBitWidth >= TreeRootIT->getBitWidth())
12187 return;
12188
12189 // If we can truncate the root, we must collect additional values that might
12190 // be demoted as a result. That is, those seeded by truncations we will
12191 // modify.
12192 while (!Roots.empty())
12193 collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
12194
12195 // Finally, map the values we can demote to the maximum bit width we computed.
12196 for (auto *Scalar : ToDemote)
12197 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
12198 }
12199
12200 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
12201 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
12202 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
12203 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
12204 auto *AA = &AM.getResult<AAManager>(F);
12205 auto *LI = &AM.getResult<LoopAnalysis>(F);
12206 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
12207 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
12208 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
12209 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
12210
12211 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
12212 if (!Changed)
12213 return PreservedAnalyses::all();
12214
12215 PreservedAnalyses PA;
12216 PA.preserveSet<CFGAnalyses>();
12217 return PA;
12218 }
12219
12220 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
12221 TargetTransformInfo *TTI_,
12222 TargetLibraryInfo *TLI_, AAResults *AA_,
12223 LoopInfo *LI_, DominatorTree *DT_,
12224 AssumptionCache *AC_, DemandedBits *DB_,
12225 OptimizationRemarkEmitter *ORE_) {
12226 if (!RunSLPVectorization)
12227 return false;
12228 SE = SE_;
12229 TTI = TTI_;
12230 TLI = TLI_;
12231 AA = AA_;
12232 LI = LI_;
12233 DT = DT_;
12234 AC = AC_;
12235 DB = DB_;
12236 DL = &F.getParent()->getDataLayout();
12237
12238 Stores.clear();
12239 GEPs.clear();
12240 bool Changed = false;
12241
12242 // If the target claims to have no vector registers don't attempt
12243 // vectorization.
12244 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
12245 LLVM_DEBUG(
12246 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
12247 return false;
12248 }
12249
12250 // Don't vectorize when the attribute NoImplicitFloat is used.
12251 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
12252 return false;
12253
12254 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
12255
12256 // Use the bottom-up SLP vectorizer to construct chains that start with
12257 // store instructions.
12258 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
12259
12260 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
12261 // delete instructions.
12262
12263 // Update DFS numbers now so that we can use them for ordering.
12264 DT->updateDFSNumbers();
12265
12266 // Scan the blocks in the function in post order.
12267 for (auto *BB : post_order(&F.getEntryBlock())) {
12268 // Start new block - clear the list of reduction roots.
12269 R.clearReductionData();
12270 collectSeedInstructions(BB);
12271
12272 // Vectorize trees that end at stores.
12273 if (!Stores.empty()) {
12274 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
12275 << " underlying objects.\n");
12276 Changed |= vectorizeStoreChains(R);
12277 }
12278
12279 // Vectorize trees that end at reductions.
12280 Changed |= vectorizeChainsInBlock(BB, R); 12281 12282 // Vectorize the index computations of getelementptr instructions. This 12283 // is primarily intended to catch gather-like idioms ending at 12284 // non-consecutive loads. 12285 if (!GEPs.empty()) { 12286 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() 12287 << " underlying objects.\n"); 12288 Changed |= vectorizeGEPIndices(BB, R); 12289 } 12290 } 12291 12292 if (Changed) { 12293 R.optimizeGatherSequence(); 12294 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 12295 } 12296 return Changed; 12297 } 12298 12299 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, 12300 unsigned Idx, unsigned MinVF) { 12301 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() 12302 << "\n"); 12303 const unsigned Sz = R.getVectorElementSize(Chain[0]); 12304 unsigned VF = Chain.size(); 12305 12306 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) 12307 return false; 12308 12309 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx 12310 << "\n"); 12311 12312 R.buildTree(Chain); 12313 if (R.isTreeTinyAndNotFullyVectorizable()) 12314 return false; 12315 if (R.isLoadCombineCandidate()) 12316 return false; 12317 R.reorderTopToBottom(); 12318 R.reorderBottomToTop(); 12319 R.buildExternalUses(); 12320 12321 R.computeMinimumValueSizes(); 12322 12323 InstructionCost Cost = R.getTreeCost(); 12324 12325 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); 12326 if (Cost < -SLPCostThreshold) { 12327 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); 12328 12329 using namespace ore; 12330 12331 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", 12332 cast<StoreInst>(Chain[0])) 12333 << "Stores SLP vectorized with cost " << NV("Cost", Cost) 12334 << " and with tree size " 12335 << NV("TreeSize", R.getTreeSize())); 12336 12337 R.vectorizeTree(); 12338 return true; 12339 } 12340 12341 return false; 12342 } 12343 12344 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, 12345 BoUpSLP &R) { 12346 // We may run into multiple chains that merge into a single chain. We mark the 12347 // stores that we vectorized so that we don't visit the same store twice. 
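// Illustrative example (hypothetical stores): for scalar stores to p[3],
// p[1], p[0] and p[2] seen in that order, the pairwise pointer-difference
// search below links them into the single chain p[0] -> p[1] -> p[2] -> p[3],
// which is then handed to vectorizeStoreChain in power-of-two slices.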
12348 BoUpSLP::ValueSet VectorizedStores;
12349 bool Changed = false;
12350
12351 int E = Stores.size();
12352 SmallBitVector Tails(E, false);
12353 int MaxIter = MaxStoreLookup.getValue();
12354 SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
12355 E, std::make_pair(E, INT_MAX));
12356 SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
12357 int IterCnt;
12358 auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
12359 &CheckedPairs,
12360 &ConsecutiveChain](int K, int Idx) {
12361 if (IterCnt >= MaxIter)
12362 return true;
12363 if (CheckedPairs[Idx].test(K))
12364 return ConsecutiveChain[K].second == 1 &&
12365 ConsecutiveChain[K].first == Idx;
12366 ++IterCnt;
12367 CheckedPairs[Idx].set(K);
12368 CheckedPairs[K].set(Idx);
12369 std::optional<int> Diff = getPointersDiff(
12370 Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
12371 Stores[Idx]->getValueOperand()->getType(),
12372 Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
12373 if (!Diff || *Diff == 0)
12374 return false;
12375 int Val = *Diff;
12376 if (Val < 0) {
12377 if (ConsecutiveChain[Idx].second > -Val) {
12378 Tails.set(K);
12379 ConsecutiveChain[Idx] = std::make_pair(K, -Val);
12380 }
12381 return false;
12382 }
12383 if (ConsecutiveChain[K].second <= Val)
12384 return false;
12385
12386 Tails.set(Idx);
12387 ConsecutiveChain[K] = std::make_pair(Idx, Val);
12388 return Val == 1;
12389 };
12390 // Do a quadratic search on all of the given stores in reverse order and find
12391 // all of the pairs of stores that follow each other.
12392 for (int Idx = E - 1; Idx >= 0; --Idx) {
12393 // If a store has multiple consecutive store candidates, search according
12394 // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
12395 // This is because pairing with the immediately succeeding or preceding
12396 // candidate usually creates the best chance of an SLP vectorization opportunity.
12397 const int MaxLookDepth = std::max(E - Idx, Idx + 1);
12398 IterCnt = 0;
12399 for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
12400 if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
12401 (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
12402 break;
12403 }
12404
12405 // Tracks if we tried to vectorize stores starting from the given tail
12406 // already.
12407 SmallBitVector TriedTails(E, false);
12408 // For stores that start but don't end a link in the chain:
12409 for (int Cnt = E; Cnt > 0; --Cnt) {
12410 int I = Cnt - 1;
12411 if (ConsecutiveChain[I].first == E || Tails.test(I))
12412 continue;
12413 // We found a store instr that starts a chain. Now follow the chain and try
12414 // to vectorize it.
12415 BoUpSLP::ValueList Operands;
12416 // Collect the chain into a list.
12417 while (I != E && !VectorizedStores.count(Stores[I])) {
12418 Operands.push_back(Stores[I]);
12419 Tails.set(I);
12420 if (ConsecutiveChain[I].second != 1) {
12421 // Mark the new end in the chain and go back, if required. It might be
12422 // required if the original stores come in reversed order, for example.
12423 if (ConsecutiveChain[I].first != E &&
12424 Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
12425 !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
12426 TriedTails.set(I);
12427 Tails.reset(ConsecutiveChain[I].first);
12428 if (Cnt < ConsecutiveChain[I].first + 2)
12429 Cnt = ConsecutiveChain[I].first + 2;
12430 }
12431 break;
12432 }
12433 // Move to the next value in the chain.
12434 I = ConsecutiveChain[I].first; 12435 } 12436 assert(!Operands.empty() && "Expected non-empty list of stores."); 12437 12438 unsigned MaxVecRegSize = R.getMaxVecRegSize(); 12439 unsigned EltSize = R.getVectorElementSize(Operands[0]); 12440 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); 12441 12442 unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), 12443 MaxElts); 12444 auto *Store = cast<StoreInst>(Operands[0]); 12445 Type *StoreTy = Store->getValueOperand()->getType(); 12446 Type *ValueTy = StoreTy; 12447 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) 12448 ValueTy = Trunc->getSrcTy(); 12449 unsigned MinVF = TTI->getStoreMinimumVF( 12450 R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); 12451 12452 if (MaxVF <= MinVF) { 12453 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " 12454 << "MinVF (" << MinVF << ")\n"); 12455 } 12456 12457 // FIXME: Is division-by-2 the correct step? Should we assert that the 12458 // register size is a power-of-2? 12459 unsigned StartIdx = 0; 12460 for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { 12461 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { 12462 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); 12463 if (!VectorizedStores.count(Slice.front()) && 12464 !VectorizedStores.count(Slice.back()) && 12465 vectorizeStoreChain(Slice, R, Cnt, MinVF)) { 12466 // Mark the vectorized stores so that we don't vectorize them again. 12467 VectorizedStores.insert(Slice.begin(), Slice.end()); 12468 Changed = true; 12469 // If we vectorized initial block, no need to try to vectorize it 12470 // again. 12471 if (Cnt == StartIdx) 12472 StartIdx += Size; 12473 Cnt += Size; 12474 continue; 12475 } 12476 ++Cnt; 12477 } 12478 // Check if the whole array was vectorized already - exit. 12479 if (StartIdx >= Operands.size()) 12480 break; 12481 } 12482 } 12483 12484 return Changed; 12485 } 12486 12487 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { 12488 // Initialize the collections. We will make a single pass over the block. 12489 Stores.clear(); 12490 GEPs.clear(); 12491 12492 // Visit the store and getelementptr instructions in BB and organize them in 12493 // Stores and GEPs according to the underlying objects of their pointer 12494 // operands. 12495 for (Instruction &I : *BB) { 12496 // Ignore store instructions that are volatile or have a pointer operand 12497 // that doesn't point to a scalar type. 12498 if (auto *SI = dyn_cast<StoreInst>(&I)) { 12499 if (!SI->isSimple()) 12500 continue; 12501 if (!isValidElementType(SI->getValueOperand()->getType())) 12502 continue; 12503 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); 12504 } 12505 12506 // Ignore getelementptr instructions that have more than one index, a 12507 // constant index, or a pointer operand that doesn't point to a scalar 12508 // type. 
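// Illustrative examples (hypothetical IR): "getelementptr i32, ptr %p, i64 %i"
// is collected below, while "getelementptr [4 x i32], ptr %p, i64 0, i64 %i"
// (two indices) and "getelementptr i32, ptr %p, i64 4" (constant index) are
// skipped.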
12509 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
12510 auto Idx = GEP->idx_begin()->get();
12511 if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
12512 continue;
12513 if (!isValidElementType(Idx->getType()))
12514 continue;
12515 if (GEP->getType()->isVectorTy())
12516 continue;
12517 GEPs[GEP->getPointerOperand()].push_back(GEP);
12518 }
12519 }
12520 }
12521
12522 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
12523 bool MaxVFOnly) {
12524 if (VL.size() < 2)
12525 return false;
12526
12527 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
12528 << VL.size() << ".\n");
12529
12530 // Check that all of the parts are instructions of the same type,
12531 // we permit an alternate opcode via InstructionsState.
12532 InstructionsState S = getSameOpcode(VL, *TLI);
12533 if (!S.getOpcode())
12534 return false;
12535
12536 Instruction *I0 = cast<Instruction>(S.OpValue);
12537 // Make sure invalid types (including vector type) are rejected before
12538 // determining vectorization factor for scalar instructions.
12539 for (Value *V : VL) {
12540 Type *Ty = V->getType();
12541 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
12542 // NOTE: the following will give the user an internal LLVM type name,
12543 // which may not be useful.
12544 R.getORE()->emit([&]() {
12545 std::string type_str;
12546 llvm::raw_string_ostream rso(type_str);
12547 Ty->print(rso);
12548 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
12549 << "Cannot SLP vectorize list: type "
12550 << rso.str() + " is unsupported by vectorizer";
12551 });
12552 return false;
12553 }
12554 }
12555
12556 unsigned Sz = R.getVectorElementSize(I0);
12557 unsigned MinVF = R.getMinVF(Sz);
12558 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
12559 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
12560 if (MaxVF < 2) {
12561 R.getORE()->emit([&]() {
12562 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
12563 << "Cannot SLP vectorize list: vectorization factor "
12564 << "less than 2 is not supported";
12565 });
12566 return false;
12567 }
12568
12569 bool Changed = false;
12570 bool CandidateFound = false;
12571 InstructionCost MinCost = SLPCostThreshold.getValue();
12572 Type *ScalarTy = VL[0]->getType();
12573 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
12574 ScalarTy = IE->getOperand(1)->getType();
12575
12576 unsigned NextInst = 0, MaxInst = VL.size();
12577 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
12578 // No actual vectorization should happen if the number of parts is the
12579 // same as the provided vectorization factor (i.e. the scalar type is used
12580 // for the vector code during codegen).
12581 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
12582 if (TTI->getNumberOfParts(VecTy) == VF)
12583 continue;
12584 for (unsigned I = NextInst; I < MaxInst; ++I) {
12585 unsigned ActualVF = std::min(MaxInst - I, VF);
12586
12587 if (!isPowerOf2_32(ActualVF))
12588 continue;
12589
12590 if (MaxVFOnly && ActualVF < MaxVF)
12591 break;
12592 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
12593 break;
12594
12595 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
12596 // Check that a previous iteration of this loop did not delete the Value.
12597 if (llvm::any_of(Ops, [&R](Value *V) {
12598 auto *I = dyn_cast<Instruction>(V);
12599 return I && R.isDeleted(I);
12600 }))
12601 continue;
12602
12603 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
12604 << "\n");
12605
12606 R.buildTree(Ops);
12607 if (R.isTreeTinyAndNotFullyVectorizable())
12608 continue;
12609 R.reorderTopToBottom();
12610 R.reorderBottomToTop(
12611 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
12612 !R.doesRootHaveInTreeUses());
12613 R.buildExternalUses();
12614
12615 R.computeMinimumValueSizes();
12616 InstructionCost Cost = R.getTreeCost();
12617 CandidateFound = true;
12618 MinCost = std::min(MinCost, Cost);
12619
12620 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
12621 << " for VF=" << ActualVF << "\n");
12622 if (Cost < -SLPCostThreshold) {
12623 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
12624 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
12625 cast<Instruction>(Ops[0]))
12626 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
12627 << " and with tree size "
12628 << ore::NV("TreeSize", R.getTreeSize()));
12629
12630 R.vectorizeTree();
12631 // Move to the next bundle.
12632 I += VF - 1;
12633 NextInst = I + 1;
12634 Changed = true;
12635 }
12636 }
12637 }
12638
12639 if (!Changed && CandidateFound) {
12640 R.getORE()->emit([&]() {
12641 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
12642 << "List vectorization was possible but not beneficial with cost "
12643 << ore::NV("Cost", MinCost) << " >= "
12644 << ore::NV("Threshold", -SLPCostThreshold);
12645 });
12646 } else if (!Changed) {
12647 R.getORE()->emit([&]() {
12648 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
12649 << "Cannot SLP vectorize list: vectorization was impossible"
12650 << " with available vectorization factors";
12651 });
12652 }
12653 return Changed;
12654 }
12655
12656 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
12657 if (!I)
12658 return false;
12659
12660 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
12661 return false;
12662
12663 Value *P = I->getParent();
12664
12665 // Vectorize in current basic block only.
12666 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
12667 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
12668 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
12669 return false;
12670
12671 // First collect all possible candidates
12672 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
12673 Candidates.emplace_back(Op0, Op1);
12674
12675 auto *A = dyn_cast<BinaryOperator>(Op0);
12676 auto *B = dyn_cast<BinaryOperator>(Op1);
12677 // Try to skip B.
12678 if (A && B && B->hasOneUse()) {
12679 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
12680 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
12681 if (B0 && B0->getParent() == P)
12682 Candidates.emplace_back(A, B0);
12683 if (B1 && B1->getParent() == P)
12684 Candidates.emplace_back(A, B1);
12685 }
12686 // Try to skip A.
12687 if (B && A && A->hasOneUse()) {
12688 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
12689 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
12690 if (A0 && A0->getParent() == P)
12691 Candidates.emplace_back(A0, B);
12692 if (A1 && A1->getParent() == P)
12693 Candidates.emplace_back(A1, B);
12694 }
12695
12696 if (Candidates.size() == 1)
12697 return tryToVectorizeList({Op0, Op1}, R);
12698
12699 // We have multiple options. Try to pick the single best.
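// Illustrative example (hypothetical values): for I = (a + b) + (c + d), the
// candidates are {a+b, c+d} plus the "skip one operand" pairs such as
// {a+b, c} and {a+b, d}, and findBestRootPair picks the pair whose operands
// look most alike.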
12700 std::optional<int> BestCandidate = R.findBestRootPair(Candidates); 12701 if (!BestCandidate) 12702 return false; 12703 return tryToVectorizeList( 12704 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); 12705 } 12706 12707 namespace { 12708 12709 /// Model horizontal reductions. 12710 /// 12711 /// A horizontal reduction is a tree of reduction instructions that has values 12712 /// that can be put into a vector as its leaves. For example: 12713 /// 12714 /// mul mul mul mul 12715 /// \ / \ / 12716 /// + + 12717 /// \ / 12718 /// + 12719 /// This tree has "mul" as its leaf values and "+" as its reduction 12720 /// instructions. A reduction can feed into a store or a binary operation 12721 /// feeding a phi. 12722 /// ... 12723 /// \ / 12724 /// + 12725 /// | 12726 /// phi += 12727 /// 12728 /// Or: 12729 /// ... 12730 /// \ / 12731 /// + 12732 /// | 12733 /// *p = 12734 /// 12735 class HorizontalReduction { 12736 using ReductionOpsType = SmallVector<Value *, 16>; 12737 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; 12738 ReductionOpsListType ReductionOps; 12739 /// List of possibly reduced values. 12740 SmallVector<SmallVector<Value *>> ReducedVals; 12741 /// Maps reduced value to the corresponding reduction operation. 12742 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; 12743 // Use map vector to make stable output. 12744 MapVector<Instruction *, Value *> ExtraArgs; 12745 WeakTrackingVH ReductionRoot; 12746 /// The type of reduction operation. 12747 RecurKind RdxKind; 12748 /// Checks if the optimization of original scalar identity operations on 12749 /// matched horizontal reductions is enabled and allowed. 12750 bool IsSupportedHorRdxIdentityOp = false; 12751 12752 static bool isCmpSelMinMax(Instruction *I) { 12753 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && 12754 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); 12755 } 12756 12757 // And/or are potentially poison-safe logical patterns like: 12758 // select x, y, false 12759 // select x, true, y 12760 static bool isBoolLogicOp(Instruction *I) { 12761 return isa<SelectInst>(I) && 12762 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr())); 12763 } 12764 12765 /// Checks if instruction is associative and can be vectorized. 12766 static bool isVectorizable(RecurKind Kind, Instruction *I) { 12767 if (Kind == RecurKind::None) 12768 return false; 12769 12770 // Integer ops that map to select instructions or intrinsics are fine. 12771 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || 12772 isBoolLogicOp(I)) 12773 return true; 12774 12775 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { 12776 // FP min/max are associative except for NaN and -0.0. We do not 12777 // have to rule out -0.0 here because the intrinsic semantics do not 12778 // specify a fixed result for it. 12779 return I->getFastMathFlags().noNaNs(); 12780 } 12781 12782 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum) 12783 return true; 12784 12785 return I->isAssociative(); 12786 } 12787 12788 static Value *getRdxOperand(Instruction *I, unsigned Index) { 12789 // Poison-safe 'or' takes the form: select X, true, Y 12790 // To make that work with the normal operand processing, we skip the 12791 // true value operand. 12792 // TODO: Change the code and data structures to handle this without a hack. 
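// Illustrative example (hypothetical IR): for the poison-safe disjunction
//   %r = select i1 %x, i1 true, i1 %y    ; equivalent to "or %x, %y"
// the reduced operands are %x (operand 0) and %y (operand 2), so Index == 1
// is remapped to operand 2 below.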
12793 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) 12794 return I->getOperand(2); 12795 return I->getOperand(Index); 12796 } 12797 12798 /// Creates reduction operation with the current opcode. 12799 static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, 12800 Value *RHS, const Twine &Name, bool UseSelect) { 12801 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 12802 bool IsConstant = isConstant(LHS) && isConstant(RHS); 12803 switch (Kind) { 12804 case RecurKind::Or: 12805 if (UseSelect && 12806 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 12807 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); 12808 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 12809 Name); 12810 case RecurKind::And: 12811 if (UseSelect && 12812 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 12813 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); 12814 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 12815 Name); 12816 case RecurKind::Add: 12817 case RecurKind::Mul: 12818 case RecurKind::Xor: 12819 case RecurKind::FAdd: 12820 case RecurKind::FMul: 12821 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 12822 Name); 12823 case RecurKind::FMax: 12824 if (IsConstant) 12825 return ConstantFP::get(LHS->getType(), 12826 maxnum(cast<ConstantFP>(LHS)->getValueAPF(), 12827 cast<ConstantFP>(RHS)->getValueAPF())); 12828 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); 12829 case RecurKind::FMin: 12830 if (IsConstant) 12831 return ConstantFP::get(LHS->getType(), 12832 minnum(cast<ConstantFP>(LHS)->getValueAPF(), 12833 cast<ConstantFP>(RHS)->getValueAPF())); 12834 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); 12835 case RecurKind::FMaximum: 12836 if (IsConstant) 12837 return ConstantFP::get(LHS->getType(), 12838 maximum(cast<ConstantFP>(LHS)->getValueAPF(), 12839 cast<ConstantFP>(RHS)->getValueAPF())); 12840 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); 12841 case RecurKind::FMinimum: 12842 if (IsConstant) 12843 return ConstantFP::get(LHS->getType(), 12844 minimum(cast<ConstantFP>(LHS)->getValueAPF(), 12845 cast<ConstantFP>(RHS)->getValueAPF())); 12846 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS); 12847 case RecurKind::SMax: 12848 if (IsConstant || UseSelect) { 12849 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); 12850 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 12851 } 12852 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); 12853 case RecurKind::SMin: 12854 if (IsConstant || UseSelect) { 12855 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); 12856 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 12857 } 12858 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); 12859 case RecurKind::UMax: 12860 if (IsConstant || UseSelect) { 12861 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); 12862 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 12863 } 12864 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); 12865 case RecurKind::UMin: 12866 if (IsConstant || UseSelect) { 12867 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); 12868 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 12869 } 12870 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); 12871 default: 12872 llvm_unreachable("Unknown reduction operation."); 12873 } 12874 } 12875 12876 /// Creates reduction operation with the current opcode with 
the IR flags 12877 /// from \p ReductionOps, dropping nuw/nsw flags. 12878 static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, 12879 Value *RHS, const Twine &Name, 12880 const ReductionOpsListType &ReductionOps) { 12881 bool UseSelect = ReductionOps.size() == 2 || 12882 // Logical or/and. 12883 (ReductionOps.size() == 1 && 12884 isa<SelectInst>(ReductionOps.front().front())); 12885 assert((!UseSelect || ReductionOps.size() != 2 || 12886 isa<SelectInst>(ReductionOps[1][0])) && 12887 "Expected cmp + select pairs for reduction"); 12888 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); 12889 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { 12890 if (auto *Sel = dyn_cast<SelectInst>(Op)) { 12891 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, 12892 /*IncludeWrapFlags=*/false); 12893 propagateIRFlags(Op, ReductionOps[1], nullptr, 12894 /*IncludeWrapFlags=*/false); 12895 return Op; 12896 } 12897 } 12898 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); 12899 return Op; 12900 } 12901 12902 public: 12903 static RecurKind getRdxKind(Value *V) { 12904 auto *I = dyn_cast<Instruction>(V); 12905 if (!I) 12906 return RecurKind::None; 12907 if (match(I, m_Add(m_Value(), m_Value()))) 12908 return RecurKind::Add; 12909 if (match(I, m_Mul(m_Value(), m_Value()))) 12910 return RecurKind::Mul; 12911 if (match(I, m_And(m_Value(), m_Value())) || 12912 match(I, m_LogicalAnd(m_Value(), m_Value()))) 12913 return RecurKind::And; 12914 if (match(I, m_Or(m_Value(), m_Value())) || 12915 match(I, m_LogicalOr(m_Value(), m_Value()))) 12916 return RecurKind::Or; 12917 if (match(I, m_Xor(m_Value(), m_Value()))) 12918 return RecurKind::Xor; 12919 if (match(I, m_FAdd(m_Value(), m_Value()))) 12920 return RecurKind::FAdd; 12921 if (match(I, m_FMul(m_Value(), m_Value()))) 12922 return RecurKind::FMul; 12923 12924 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) 12925 return RecurKind::FMax; 12926 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) 12927 return RecurKind::FMin; 12928 12929 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value()))) 12930 return RecurKind::FMaximum; 12931 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value()))) 12932 return RecurKind::FMinimum; 12933 // This matches either cmp+select or intrinsics. SLP is expected to handle 12934 // either form. 12935 // TODO: If we are canonicalizing to intrinsics, we can remove several 12936 // special-case paths that deal with selects. 12937 if (match(I, m_SMax(m_Value(), m_Value()))) 12938 return RecurKind::SMax; 12939 if (match(I, m_SMin(m_Value(), m_Value()))) 12940 return RecurKind::SMin; 12941 if (match(I, m_UMax(m_Value(), m_Value()))) 12942 return RecurKind::UMax; 12943 if (match(I, m_UMin(m_Value(), m_Value()))) 12944 return RecurKind::UMin; 12945 12946 if (auto *Select = dyn_cast<SelectInst>(I)) { 12947 // Try harder: look for min/max pattern based on instructions producing 12948 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 
12949 // During the intermediate stages of SLP, it's very common to have 12950 // pattern like this (since optimizeGatherSequence is run only once 12951 // at the end): 12952 // %1 = extractelement <2 x i32> %a, i32 0 12953 // %2 = extractelement <2 x i32> %a, i32 1 12954 // %cond = icmp sgt i32 %1, %2 12955 // %3 = extractelement <2 x i32> %a, i32 0 12956 // %4 = extractelement <2 x i32> %a, i32 1 12957 // %select = select i1 %cond, i32 %3, i32 %4 12958 CmpInst::Predicate Pred; 12959 Instruction *L1; 12960 Instruction *L2; 12961 12962 Value *LHS = Select->getTrueValue(); 12963 Value *RHS = Select->getFalseValue(); 12964 Value *Cond = Select->getCondition(); 12965 12966 // TODO: Support inverse predicates. 12967 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { 12968 if (!isa<ExtractElementInst>(RHS) || 12969 !L2->isIdenticalTo(cast<Instruction>(RHS))) 12970 return RecurKind::None; 12971 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { 12972 if (!isa<ExtractElementInst>(LHS) || 12973 !L1->isIdenticalTo(cast<Instruction>(LHS))) 12974 return RecurKind::None; 12975 } else { 12976 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) 12977 return RecurKind::None; 12978 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || 12979 !L1->isIdenticalTo(cast<Instruction>(LHS)) || 12980 !L2->isIdenticalTo(cast<Instruction>(RHS))) 12981 return RecurKind::None; 12982 } 12983 12984 switch (Pred) { 12985 default: 12986 return RecurKind::None; 12987 case CmpInst::ICMP_SGT: 12988 case CmpInst::ICMP_SGE: 12989 return RecurKind::SMax; 12990 case CmpInst::ICMP_SLT: 12991 case CmpInst::ICMP_SLE: 12992 return RecurKind::SMin; 12993 case CmpInst::ICMP_UGT: 12994 case CmpInst::ICMP_UGE: 12995 return RecurKind::UMax; 12996 case CmpInst::ICMP_ULT: 12997 case CmpInst::ICMP_ULE: 12998 return RecurKind::UMin; 12999 } 13000 } 13001 return RecurKind::None; 13002 } 13003 13004 /// Get the index of the first operand. 13005 static unsigned getFirstOperandIndex(Instruction *I) { 13006 return isCmpSelMinMax(I) ? 1 : 0; 13007 } 13008 13009 private: 13010 /// Total number of operands in the reduction operation. 13011 static unsigned getNumberOfOperands(Instruction *I) { 13012 return isCmpSelMinMax(I) ? 3 : 2; 13013 } 13014 13015 /// Checks if the instruction is in basic block \p BB. 13016 /// For a cmp+sel min/max reduction check that both ops are in \p BB. 13017 static bool hasSameParent(Instruction *I, BasicBlock *BB) { 13018 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) { 13019 auto *Sel = cast<SelectInst>(I); 13020 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); 13021 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; 13022 } 13023 return I->getParent() == BB; 13024 } 13025 13026 /// Expected number of uses for reduction operations/reduced values. 13027 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { 13028 if (IsCmpSelMinMax) { 13029 // SelectInst must be used twice while the condition op must have single 13030 // use only. 13031 if (auto *Sel = dyn_cast<SelectInst>(I)) 13032 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse(); 13033 return I->hasNUses(2); 13034 } 13035 13036 // Arithmetic reduction operation must be used once only. 13037 return I->hasOneUse(); 13038 } 13039 13040 /// Initializes the list of reduction operations. 
13041 void initReductionOps(Instruction *I) { 13042 if (isCmpSelMinMax(I)) 13043 ReductionOps.assign(2, ReductionOpsType()); 13044 else 13045 ReductionOps.assign(1, ReductionOpsType()); 13046 } 13047 13048 /// Add all reduction operations for the reduction instruction \p I. 13049 void addReductionOps(Instruction *I) { 13050 if (isCmpSelMinMax(I)) { 13051 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); 13052 ReductionOps[1].emplace_back(I); 13053 } else { 13054 ReductionOps[0].emplace_back(I); 13055 } 13056 } 13057 13058 static bool isGoodForReduction(ArrayRef<Value *> Data) { 13059 int Sz = Data.size(); 13060 auto *I = dyn_cast<Instruction>(Data.front()); 13061 return Sz > 1 || isConstant(Data.front()) || 13062 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode())); 13063 } 13064 13065 public: 13066 HorizontalReduction() = default; 13067 13068 /// Try to find a reduction tree. 13069 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, 13070 ScalarEvolution &SE, const DataLayout &DL, 13071 const TargetLibraryInfo &TLI) { 13072 RdxKind = HorizontalReduction::getRdxKind(Root); 13073 if (!isVectorizable(RdxKind, Root)) 13074 return false; 13075 13076 // Analyze "regular" integer/FP types for reductions - no target-specific 13077 // types or pointers. 13078 Type *Ty = Root->getType(); 13079 if (!isValidElementType(Ty) || Ty->isPointerTy()) 13080 return false; 13081 13082 // Though the ultimate reduction may have multiple uses, its condition must 13083 // have only single use. 13084 if (auto *Sel = dyn_cast<SelectInst>(Root)) 13085 if (!Sel->getCondition()->hasOneUse()) 13086 return false; 13087 13088 ReductionRoot = Root; 13089 13090 // Iterate through all the operands of the possible reduction tree and 13091 // gather all the reduced values, sorting them by their value id. 13092 BasicBlock *BB = Root->getParent(); 13093 bool IsCmpSelMinMax = isCmpSelMinMax(Root); 13094 SmallVector<Instruction *> Worklist(1, Root); 13095 // Checks if the operands of the \p TreeN instruction are also reduction 13096 // operations or should be treated as reduced values or an extra argument, 13097 // which is not part of the reduction. 13098 auto CheckOperands = [&](Instruction *TreeN, 13099 SmallVectorImpl<Value *> &ExtraArgs, 13100 SmallVectorImpl<Value *> &PossibleReducedVals, 13101 SmallVectorImpl<Instruction *> &ReductionOps) { 13102 for (int I = getFirstOperandIndex(TreeN), 13103 End = getNumberOfOperands(TreeN); 13104 I < End; ++I) { 13105 Value *EdgeVal = getRdxOperand(TreeN, I); 13106 ReducedValsToOps[EdgeVal].push_back(TreeN); 13107 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); 13108 // Edge has wrong parent - mark as an extra argument. 13109 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && 13110 !hasSameParent(EdgeInst, BB)) { 13111 ExtraArgs.push_back(EdgeVal); 13112 continue; 13113 } 13114 // If the edge is not an instruction, or it is different from the main 13115 // reduction opcode or has too many uses - possible reduced value. 13116 // Also, do not try to reduce const values, if the operation is not 13117 // foldable. 
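// Illustrative example (hypothetical expression): in ((a*b) + (c*d)) + e,
// the two "+" nodes are reduction operations, while a*b, c*d and e are
// treated as reduced values (leaves), since "*" differs from the reduction
// opcode.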
13118 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
13119 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
13120 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
13121 !isVectorizable(RdxKind, EdgeInst) ||
13122 (R.isAnalyzedReductionRoot(EdgeInst) &&
13123 all_of(EdgeInst->operands(), Constant::classof))) {
13124 PossibleReducedVals.push_back(EdgeVal);
13125 continue;
13126 }
13127 ReductionOps.push_back(EdgeInst);
13128 }
13129 };
13130 // Try to regroup reduced values so that it gets more profitable to try to
13131 // reduce them. Values are grouped by their value ids, instructions by their
13132 // opcode and/or alternate opcode, with extra analysis for loads (grouping
13133 // them by the distance between pointers) and cmp instructions (grouping
13134 // them by the predicate).
13135 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
13136 PossibleReducedVals;
13137 initReductionOps(Root);
13138 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
13139 SmallSet<size_t, 2> LoadKeyUsed;
13140 SmallPtrSet<Value *, 4> DoNotReverseVals;
13141
13142 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13143 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
13144 if (LoadKeyUsed.contains(Key)) {
13145 auto LIt = LoadsMap.find(Ptr);
13146 if (LIt != LoadsMap.end()) {
13147 for (LoadInst *RLI : LIt->second) {
13148 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13149 LI->getType(), LI->getPointerOperand(), DL, SE,
13150 /*StrictCheck=*/true))
13151 return hash_value(RLI->getPointerOperand());
13152 }
13153 for (LoadInst *RLI : LIt->second) {
13154 if (arePointersCompatible(RLI->getPointerOperand(),
13155 LI->getPointerOperand(), TLI)) {
13156 hash_code SubKey = hash_value(RLI->getPointerOperand());
13157 DoNotReverseVals.insert(RLI);
13158 return SubKey;
13159 }
13160 }
13161 if (LIt->second.size() > 2) {
13162 hash_code SubKey =
13163 hash_value(LIt->second.back()->getPointerOperand());
13164 DoNotReverseVals.insert(LIt->second.back());
13165 return SubKey;
13166 }
13167 }
13168 }
13169 LoadKeyUsed.insert(Key);
13170 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
13171 return hash_value(LI->getPointerOperand());
13172 };
13173
13174 while (!Worklist.empty()) {
13175 Instruction *TreeN = Worklist.pop_back_val();
13176 SmallVector<Value *> Args;
13177 SmallVector<Value *> PossibleRedVals;
13178 SmallVector<Instruction *> PossibleReductionOps;
13179 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
13180 // If too many extra args - mark the instruction itself as a reduction
13181 // value, not a reduction operation.
13182 if (Args.size() < 2) {
13183 addReductionOps(TreeN);
13184 // Add extra args.
13185 if (!Args.empty()) {
13186 assert(Args.size() == 1 && "Expected only single argument.");
13187 ExtraArgs[TreeN] = Args.front();
13188 }
13189 // Add reduction values. The values are sorted for better vectorization
13190 // results.
        for (Value *V : PossibleRedVals) {
          size_t Key, Idx;
          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                 /*AllowAlternate=*/false);
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds so that the reduction
    // starts from the longest possible sequences of reduced values.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (isGoodForReduction(Data) ||
            (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
             isa<LoadInst>(ReducedVals[NewIdx].front()) &&
             getUnderlyingObject(
                 cast<LoadInst>(Data.front())->getPointerOperand()) ==
                 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
                                         ->getPointerOperand()))) {
          if (NewIdx < 0) {
            NewIdx = ReducedVals.size();
            ReducedVals.emplace_back();
          }
          if (DoNotReverseVals.contains(Data.front()))
            ReducedVals[NewIdx].append(Data.begin(), Data.end());
          else
            ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
        } else {
          ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
        }
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }

  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI) {
    constexpr int ReductionLimit = 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
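    // E.g. 14 reduced i32 values on a target with 128-bit registers may still
    // be emitted as one <8 x i32> reduction; type legalization later splits
    // the oversized vector into two legal <4 x i32> halves.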
    unsigned NumReducedVals =
        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                          if (!isGoodForReduction(Vals))
                            return Num;
                          return Num + Vals.size();
                        });
    if (NumReducedVals < ReductionLimit &&
        (!AllowHorRdxIdenityOptimization ||
         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
         }))) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // instructions because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(
        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
    SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
    // The same extra argument may be used several times, so log each attempt
    // to use it.
    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
      assert(Pair.first && "DebugLoc must be set.");
      ExternallyUsedValues[Pair.second].push_back(Pair.first);
      TrackedVals.try_emplace(Pair.second, Pair.second);
    }

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    // Return new VectorizedTree, based on previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    // The reduction root is used as the insertion point for new instructions,
    // so set it as externally used to prevent it from being deleted.
    ExternallyUsedValues[ReductionRoot];
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
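    // E.g. if a scalar %s is vectorized by an earlier subvector attempt, it
    // is replaced by an extractelement; TrackedVals then maps the original %s
    // to that replacement so later attempts operate on the live value.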
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    Value *VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
        // Check if the reduction value was not overridden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values.
        if (auto *Inst = dyn_cast<Instruction>(RdxVal))
          if (isVectorLikeInstWithConstOps(Inst) &&
              (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
            continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
          I + 1 < E) {
        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
        if (NextS.getOpcode() == Instruction::ExtractElement &&
            !NextS.isAltShuffle()) {
          SmallVector<Value *> CommonCandidates(Candidates);
          for (Value *RV : ReducedVals[I + 1]) {
            Value *RdxVal = TrackedVals.find(RV)->second;
            // Check if the reduction value was not overridden by the
            // extractelement instruction because of the vectorization and
            // exclude it, if it is not compatible with other values.
            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
                continue;
            CommonCandidates.push_back(RdxVal);
            TrackedToOrig.try_emplace(RdxVal, RV);
          }
          SmallVector<int> Mask;
          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
            ++I;
            Candidates.swap(CommonCandidates);
            ShuffledExtracts = true;
          }
        }
      }

      // Emit code for constant values.
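      // E.g. an all-constant group such as (1, 2, 3) needs no vector code at
      // all: the repeated createOp calls below are folded by the IRBuilder,
      // so an add reduction of the group simply becomes the constant 6.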
      if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
          allConstant(Candidates)) {
        Value *Res = Candidates.front();
        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
           !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization of
      // original scalar identity operations on matched horizontal reductions).
      IsSupportedHorRdxIdentityOp =
          AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      MapVector<Value *, unsigned> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates)
          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
      // Used to check if the reduced values are used the same number of
      // times. In this case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>. Plus, the final reduction will be performed
      // on <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd>
      // tree immediately, reduce it, and multiply the result by 2.
      // Currently it only handles add/fadd/xor. and/or/min/max do not require
      // this analysis, other operations may require an extra estimation of
      // the profitability.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [](const auto &P) { return P.first; });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
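        // E.g. a pure splat reduction a + a + a + a deduplicates to a single
        // candidate; for an add reduction, emitScaleForReusedOps below turns
        // it into 'mul a, 4' instead of emitting any vector code.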
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
          unsigned Cnt = SameValuesCounter.lookup(OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      unsigned MaxElts =
          RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned ReduxWidth = std::min<unsigned>(
          llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
                                  &CheckForReusedReductionOpsLocal,
                                  &PrevReduxWidth, &V,
                                  &IgnoreList](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with a smaller number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        ReduxWidth /= 2;
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Being analyzed already - skip.
        if (V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
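        // E.g. while vectorizing the group {%a, %b, %c, %d}, a value %x from
        // another group may be pulled into this tree as a gathered scalar; it
        // is marked externally used so that it stays extractable for the
        // later reduction of its own group.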
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for_each(ReducedVals[Cnt],
                   [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
                     if (isa<Instruction>(V))
                       LocalExternallyUsedValues[TrackedVals[V]];
                   });
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.find(V)->second;
            ++SameValuesCounter[OrigV];
          }
        }
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues[RdxVal];
            continue;
          }
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          unsigned NumOps =
              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
            LocalExternallyUsedValues[RdxVal];
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalar, replaced by
        // extractelement instructions.
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
          auto It = ExternallyUsedValues.find(Pair.first);
          if (It == ExternallyUsedValues.end())
            continue;
          LocalExternallyUsedValues[Pair.second].append(It->second);
        }
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost TreeCost = V.getTreeCost(VL);
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
        InstructionCost Cost = TreeCost + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          return nullptr;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(
                       SV_NAME, "HorSLPNotBeneficial",
                       ReducedValsToOps.find(VL[0])->second.front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(
                     SV_NAME, "VectorizedHorizontalReduction",
                     ReducedValsToOps.find(VL[0])->second.front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
                                                ReplacedExternals, InsertPt);

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must be
        // frozen.
        if (isBoolLogicOp(RdxRootInst))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot =
              emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
                            SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);

        // Improved analysis for add/fadd/xor reductions with same scale factor
        // for all operands of reductions. We can emit scalar ops for them
        // instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from final reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          Value *OrigV = TrackedToOrig.find(P.first)->second;
          VectorizedVals.try_emplace(OrigV, P.second);
        }
        continue;
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      // I.e., if we have original code like this:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 RHS, i1 ?, i1 false

      // Then, we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.

      // If we have original code like this and both values could be poison:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 ?, i1 RHS, i1 false

      // Then, we must freeze LHS in the new op.
      auto &&FixBoolLogicalOps =
          [&Builder, VectorizedTree](Value *&LHS, Value *&RHS,
                                     Instruction *RedOp1, Instruction *RedOp2) {
            if (!isBoolLogicOp(RedOp1))
              return;
            if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS ||
                isGuaranteedNotToBePoison(LHS))
              return;
            if (!isBoolLogicOp(RedOp2))
              return;
            if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS ||
                isGuaranteedNotToBePoison(RHS)) {
              std::swap(LHS, RHS);
              return;
            }
            LHS = Builder.CreateFreeze(LHS);
          };
      // Finish the reduction.
      // Need to add the extra arguments and the possible reduction values
      // that were not vectorized.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      auto &&FinalGen =
          [this, &Builder, &TrackedVals, &FixBoolLogicalOps](
              ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                RedOp);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.find(RdxVal)->second)
                   .drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      for (auto &Pair : ExternallyUsedValues) {
        // Add each externally used value to the final reduction.
        for (auto *I : Pair.second)
          ExtraReductions.emplace_back(I, Pair.first);
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
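      // E.g. remainders r0..r4 are combined pairwise as (r0 op r1),
      // (r2 op r3), r4 and then reduced again, forming a log2-depth tree of
      // scalar ops instead of one long serial dependency chain.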
      while (ExtraReductions.size() > 1) {
        VectorizedTree = ExtraReductions.front().second;
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions);
        ExtraReductions.swap(NewReds);
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with undef, and mark for eventual
      // deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *Undef = UndefValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(Undef);
          }
          V.eraseInstruction(cast<Instruction>(Ignore));
        }
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }

private:
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
                                   FastMathFlags FMF) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Value *FirstReducedVal = ReducedVals.front();
    Type *ScalarTy = FirstReducedVal->getType();
    FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts)
        VectorCost =
            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts)
        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction that starts with " << *FirstReducedVal
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }

  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
                       unsigned ReduxWidth, const TargetTransformInfo *TTI) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    ++NumVectorInstructions;
    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
  }

  /// Emits optimized code for the unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? vv : 0
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::SelectICmp:
    case RecurKind::SelectFCmp:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }

  /// Emits the actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                       ArrayRef<Value *> VL,
                       const MapVector<Value *, unsigned> &SameValuesCounter,
                       const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
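      // E.g. for VL = <a, b, c, d> with counts {a: 2, b: 1, c: 3, d: 1} the
      // mask becomes <4, 1, 2, 3>: lane 0 picks 0 from the zero vector
      // because a xor a == 0, while odd-count lanes keep their value.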
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <";
                 for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::SelectICmp:
    case RecurKind::SelectFCmp:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}

static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getInsertIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex)
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex);

    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

/// Recognize construction of vectors like
///  %ra = insertelement <4 x float> poison, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {

  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");

  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
  llvm::erase_value(BuildVectorOpds, nullptr);
  llvm::erase_value(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}

/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
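  // E.g. (hypothetical IR) a loop-carried sum:
  //   loop:
  //     %phi = phi i32 [ 0, %entry ], [ %sum, %loop ]
  //     %sum = add i32 %phi, %val
  // where %sum is the value incoming from the latch block.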
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
    return true;
  return false;
}

/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}

bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
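  // E.g. for 'r *= v1 + v2 + v3 + v4' the phi feeds a 'mul', but the
  // vectorizable reduction is rooted at the first '+' on the mul's other
  // operand, so that add becomes the new seed.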
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction, or
  // vectorization is not possible or not effective, and the currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, we collect
  // instructions for possible later attempts at vectorization.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R) {
  const DataLayout &DL = BB->getModule()->getDataLayout();
  if (!R.canMapToVector(IVI->getType(), DL))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
      (llvm::all_of(
           BuildVectorOpds,
           [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R);
}

template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
      ++SameTypeIt;

    // Try to vectorize them.
    unsigned NumElts = (SameTypeIt - IncIt);
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-step attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    //    the size of the maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    //    possible. This may produce better vectorization results than trying
    //    to vectorize only instructions with the same/alternate opcodes.
    // 3. Make a final attempt to vectorize all instructions with the
    //    same/alternate ops only; this may result in some extra final
    //    vectorization.
    if (NumElts > 1 &&
        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        Candidates.append(IncIt, std::next(IncIt, NumElts));
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End;) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
            ++SameTypeIt;
          unsigned NumElts = (SameTypeIt - It);
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}

/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if two cmps have same/swapped predicates and compatible
/// corresponding operands. If IsCompatibility is false, the function
/// implements a strict weak ordering relation between two cmp instructions,
/// returning true if the first instruction is "less" than the second, i.e. if
/// its predicate is less than the predicate of the second or the operand IDs
/// are less than the operand IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool LEPreds = Pred1 <= Pred2;
  bool GEPreds = Pred1 >= Pred2;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        return !IsCompatibility && I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}

template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op))
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize a list of compares.
  // Sort by type, compare predicate, etc.
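  // E.g. 'icmp sgt %a, %b' and 'icmp slt %c, %d' can land in one group:
  // the swapped predicate of sgt is slt, so the two compares are treated as
  // having the same base predicate and become candidates for one bundle.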
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(V->getType()))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}

bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions,
                [](auto *I) {
                  return isa<InsertElementInst, InsertValueInst>(I);
                }) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  // Pass 1: try to vectorize reductions only.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
  }
  // Pass 2: try to match and vectorize a buildvector sequence.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}

bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This allows us to better identify the chains that can be
  // vectorized.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
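    // (E.g., two i32 phis compare equal at this step and fall through to the
    // operand-based ordering below, while phis of different scalar types are
    // ordered by their TypeID so that same-typed phis end up adjacent.)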
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    std::optional<bool> ConstOrder;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
        if (!ConstOrder)
          ConstOrder =
              !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
        continue;
      }
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
        if (!ConstOrder)
          ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
        continue;
      }
      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
        return true;
      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
        return false;
    }
    return ConstOrder && *ConstOrder;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
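      // (The surviving phis are re-collected on every iteration of the
      // enclosing do-while loop, but VisitedInstrs ensures each phi is
      // analyzed at most once.)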
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction without users, like a terminator,
  // a store, or a function call with an ignored return value. Unused
  // instructions are detected based on the instruction type, except for
  // CallInst and InvokeInst.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // Skip instructions with scalable type. The number of elements is unknown
    // at compile time for scalable types.
    if (isa<ScalableVectorType>(it->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*it))
      continue;
    // We may go through BB multiple times, so skip the ones we have already
    // checked.
    if (!VisitedInstrs.insert(&*it).second) {
      if (HasNoUsers(&*it) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
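    // A typical candidate is a loop-carried or if-converted sum, e.g.
    // (hypothetical IR):
    //   %p = phi i32 [ 0, %entry ], [ %sum, %latch ]
    //   ...
    //   %sum = add i32 %x, %p
    // getReductionInstr walks from the two-operand phi to the reduction root
    // (%sum here), which vectorizeRootInstruction then tries to match as a
    // horizontal reduction.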
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          it = BB->begin();
          e = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*it)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(it);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
                             SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : it->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(it))
      PostProcessInserts.insert(&*it);
    else if (isa<CmpInst>(it))
      PostProcessCmps.insert(cast<CmpInst>(&*it));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
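    // (A single getelementptr has no sibling whose index computation could
    // occupy another vector lane.)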
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, they are marked as deleted, so remove
      // them from the set of candidates.
      Candidates.remove_if(
          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
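  // For illustration (hypothetical IR), a group collected for one underlying
  // object might be:
  //   store i32 %x0, ptr %p
  //   store i32 %x1, ptr %p1  ; %p1 = getelementptr i32, ptr %p, i64 1
  //   store i32 %x2, ptr %p2  ; consecutive addresses off the same base
  //   store i32 %x3, ptr %p3
  // After sorting/grouping by compatibility, vectorizeStores can replace the
  // whole run with a single `store <4 x i32>`.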
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    Changed |= tryToVectorizeSequence<StoreInst>(
        Pair.second, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}