//===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines some vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_VECTORUTILS_H
#define LLVM_ANALYSIS_VECTORUTILS_H

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/CheckedArithmetic.h"

namespace llvm {
class TargetLibraryInfo;

/// Describes the type of Parameters
enum class VFParamKind {
  Vector,            // No semantic information.
  OMP_Linear,        // declare simd linear(i)
  OMP_LinearRef,     // declare simd linear(ref(i))
  OMP_LinearVal,     // declare simd linear(val(i))
  OMP_LinearUVal,    // declare simd linear(uval(i))
  OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c)
  OMP_Uniform,       // declare simd uniform(i)
  GlobalPredicate,   // Global logical predicate that acts on all lanes
                     // of the input and output mask concurrently. For
                     // example, it is implied by the `M` token in the
                     // Vector Function ABI mangled name.
  Unknown
};

/// Describes the type of Instruction Set Architecture
enum class VFISAKind {
  AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  SVE,          // AArch64 Scalable Vector Extension
  SSE,          // x86 SSE
  AVX,          // x86 AVX
  AVX2,         // x86 AVX2
  AVX512,       // x86 AVX512
  LLVM,         // LLVM internal ISA for functions that are not
                // attached to an existing ABI via name mangling.
  Unknown       // Unknown ISA
};

/// Encapsulates information needed to describe a parameter.
///
/// The description of the parameter is not linked directly to
/// OpenMP or any other vector function description. This structure
/// is extendible to handle other paradigms that describe vector
/// functions and their parameters.
struct VFParameter {
  unsigned ParamPos;         // Parameter Position in Scalar Function.
  VFParamKind ParamKind;     // Kind of Parameter.
  int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  Align Alignment = Align(); // Optional alignment in bytes, defaulted to 1.

  // Comparison operator.
  bool operator==(const VFParameter &Other) const {
    return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
           std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
                    Other.Alignment);
  }
};

/// Contains the information about the kind of vectorization
/// available.
///
/// This object is independent of the paradigm used to
/// represent vector functions. In particular, it is not attached to
/// any target-specific ABI.
struct VFShape {
  ElementCount VF;                        // Vectorization factor.
  SmallVector<VFParameter, 8> Parameters; // List of parameter information.
  // Comparison operator.
  bool operator==(const VFShape &Other) const {
    return std::tie(VF, Parameters) == std::tie(Other.VF, Other.Parameters);
  }

  /// Update the parameter in position P.ParamPos to P.
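  ///
  /// A minimal usage sketch; the parameter position and kind below are
  /// hypothetical, chosen only for illustration:
  /// \code
  ///   // Start from the all-vector shape of a call with two lanes, then
  ///   // mark parameter 1 as uniform.
  ///   VFShape Shape = VFShape::get(CI, ElementCount::getFixed(2),
  ///                                /*HasGlobalPred=*/false);
  ///   Shape.updateParam(VFParameter({1, VFParamKind::OMP_Uniform}));
  /// \endcode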
  void updateParam(VFParameter P) {
    assert(P.ParamPos < Parameters.size() && "Invalid parameter position.");
    Parameters[P.ParamPos] = P;
    assert(hasValidParameterList() && "Invalid parameter list");
  }

  // Retrieve the VFShape that can be used to map a (scalar) function to
  // itself, with VF = 1.
  static VFShape getScalarShape(const CallInst &CI) {
    return VFShape::get(CI, ElementCount::getFixed(1),
                        /*HasGlobalPredicate*/ false);
  }

  // Retrieve the basic vectorization shape of the function, where all
  // parameters are mapped to VFParamKind::Vector with \p EC
  // lanes. Specifies whether the function has a Global Predicate
  // argument via \p HasGlobalPred.
  static VFShape get(const CallInst &CI, ElementCount EC, bool HasGlobalPred) {
    SmallVector<VFParameter, 8> Parameters;
    for (unsigned I = 0; I < CI.arg_size(); ++I)
      Parameters.push_back(VFParameter({I, VFParamKind::Vector}));
    if (HasGlobalPred)
      Parameters.push_back(
          VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate}));

    return {EC, Parameters};
  }
  /// Validation check on the Parameters in the VFShape.
  bool hasValidParameterList() const;
};

/// Holds the VFShape for a specific scalar to vector function mapping.
struct VFInfo {
  VFShape Shape;          /// Classification of the vector function.
  std::string ScalarName; /// Scalar Function Name.
  std::string VectorName; /// Vector Function Name associated to this VFInfo.
  VFISAKind ISA;          /// Instruction Set Architecture.
};

namespace VFABI {
/// LLVM Internal VFABI ISA token for vector functions.
static constexpr char const *_LLVM_ = "_LLVM_";
/// Prefix for internal name redirection for a vector function that
/// tells the compiler to scalarize the call using the scalar name
/// of the function. For example, a mangled name like
/// `_ZGV_LLVM_N2v_foo(_LLVM_Scalarize_foo)` would tell the
/// vectorizer to vectorize the scalar call `foo`, and to scalarize
/// it once vectorization is done.
static constexpr char const *_LLVM_Scalarize_ = "_LLVM_Scalarize_";

/// Function to construct a VFInfo out of a mangled name in the
/// following format:
///
///   <VFABI_name>{(<redirection>)}
///
/// where <VFABI_name> is the name of the vector function, mangled according
/// to the rules described in the Vector Function ABI of the target vector
/// extension (or <isa> from now on). The <VFABI_name> is in the following
/// format:
///
///   _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
///
/// This method supports demangling rules for the following <isa>:
///
/// * AArch64: https://developer.arm.com/docs/101129/latest
///
/// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
///   https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
///
/// \param MangledName -> input string in the format
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
/// \param M -> Module used to retrieve information about the vector
/// function that is not possible to retrieve from the mangled
/// name. At the moment, this parameter is needed only to retrieve the
/// Vectorization Factor of scalable vector functions from their
/// respective IR declarations.
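///
/// An illustrative query, reusing the example name from above (all names
/// hypothetical): demangling "_ZGV_LLVM_N2v_foo(vector_foo)" yields a VFInfo
/// describing a 2-lane, unmasked variant of the scalar function `foo`,
/// redirected to the symbol `vector_foo`:
/// \code
///   Optional<VFInfo> Info =
///       VFABI::tryDemangleForVFABI("_ZGV_LLVM_N2v_foo(vector_foo)", M);
///   if (Info)
///     assert(Info->ScalarName == "foo" && Info->VectorName == "vector_foo");
/// \endcode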
Optional<VFInfo> tryDemangleForVFABI(StringRef MangledName, const Module &M);

/// This routine mangles the given VectorName according to the LangRef
/// specification for the vector-function-abi-variant attribute and is
/// specific to the TLI mappings. It is the responsibility of the caller to
/// make sure that this is only used if all parameters in the vector function
/// are of vector type. The returned string holds a scalar-to-vector mapping:
///
///   _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
///
/// where:
///
/// <isa> = "_LLVM_"
/// <mask> = "N". Note: TLI does not support masked interfaces.
/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
///          field of the `VecDesc` struct. If the number of lanes is scalable
///          then 'x' is printed instead.
/// <vparams> = "v", repeated numArgs times.
/// <scalarname> = the name of the scalar function.
/// <vectorname> = the name of the vector function.
std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
                                unsigned numArgs, ElementCount VF);

/// Retrieve the `VFParamKind` from a string token.
VFParamKind getVFParamKindFromString(const StringRef Token);

// Name of the attribute where the variant mappings are stored.
static constexpr char const *MappingsAttrName = "vector-function-abi-variant";

/// Populates a set of strings representing the Vector Function ABI variants
/// associated to the CallInst CI. If the CI does not contain the
/// vector-function-abi-variant attribute, we return without populating
/// VariantMappings, i.e. callers of getVectorVariantNames need not check for
/// the presence of the attribute (see InjectTLIMappings).
void getVectorVariantNames(const CallInst &CI,
                           SmallVectorImpl<std::string> &VariantMappings);
} // end namespace VFABI

/// The Vector Function Database.
///
/// Helper class used to find the vector functions associated to a
/// scalar CallInst.
class VFDatabase {
  /// The Module of the CallInst CI.
  const Module *M;
  /// The CallInst instance being queried for scalar to vector mappings.
  const CallInst &CI;
  /// List of vector function descriptors associated to the call
  /// instruction.
  const SmallVector<VFInfo, 8> ScalarToVectorMappings;

  /// Retrieve the scalar-to-vector mappings associated to the rule of
  /// a vector Function ABI.
  static void getVFABIMappings(const CallInst &CI,
                               SmallVectorImpl<VFInfo> &Mappings) {
    if (!CI.getCalledFunction())
      return;

    const StringRef ScalarName = CI.getCalledFunction()->getName();

    SmallVector<std::string, 8> ListOfStrings;
    // The check for the vector-function-abi-variant attribute is done when
    // retrieving the vector variant names here.
    VFABI::getVectorVariantNames(CI, ListOfStrings);
    if (ListOfStrings.empty())
      return;
    for (const auto &MangledName : ListOfStrings) {
      const Optional<VFInfo> Shape =
          VFABI::tryDemangleForVFABI(MangledName, *(CI.getModule()));
      // A match is found via scalar and vector names, and also by
      // ensuring that the variant described in the attribute has a
      // corresponding definition or declaration of the vector
      // function in the Module M.
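      // Note that the existence check below is an assertion only: in a build
      // with assertions disabled, a variant whose vector function is missing
      // from the module would still be pushed into Mappings.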
      if (Shape && (Shape.value().ScalarName == ScalarName)) {
        assert(CI.getModule()->getFunction(Shape.value().VectorName) &&
               "Vector function is missing.");
        Mappings.push_back(Shape.value());
      }
    }
  }

public:
  /// Retrieve all the VFInfo instances associated to the CallInst CI.
  static SmallVector<VFInfo, 8> getMappings(const CallInst &CI) {
    SmallVector<VFInfo, 8> Ret;

    // Get mappings from the Vector Function ABI variants.
    getVFABIMappings(CI, Ret);

    // Other non-VFABI variants should be retrieved here.

    return Ret;
  }

  /// Constructor, requires a CallInst instance.
  VFDatabase(CallInst &CI)
      : M(CI.getModule()), CI(CI),
        ScalarToVectorMappings(VFDatabase::getMappings(CI)) {}
  /// \defgroup VFDatabase query interface.
  ///
  /// @{
  /// Retrieve the Function with VFShape \p Shape.
  Function *getVectorizedFunction(const VFShape &Shape) const {
    if (Shape == VFShape::getScalarShape(CI))
      return CI.getCalledFunction();

    for (const auto &Info : ScalarToVectorMappings)
      if (Info.Shape == Shape)
        return M->getFunction(Info.VectorName);

    return nullptr;
  }
  /// @}
};

template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
template <typename InstTy> class InterleaveGroup;
class IRBuilderBase;
class Loop;
class ScalarEvolution;
class TargetTransformInfo;
class Type;
class Value;

namespace Intrinsic {
typedef unsigned ID;
}

/// A helper function for converting Scalar types to vector types. If
/// the incoming type is void, we return void. If the EC represents a
/// scalar, we return the scalar type.
inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
    return Scalar;
  return VectorType::get(Scalar, EC);
}

inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
}

/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all scalars
/// for the scalar form of the intrinsic and all vectors (or scalars handled by
/// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic.
bool isTriviallyVectorizable(Intrinsic::ID ID);

/// Identifies if the vector form of the intrinsic has a scalar operand.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                        unsigned ScalarOpdIdx);

/// Identifies if the vector form of the intrinsic has an operand that has
/// an overloaded type.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);

/// Returns intrinsic ID for call.
/// For the input call instruction it finds the mapping intrinsic and returns
/// its intrinsic ID; if no such intrinsic exists, it returns not_intrinsic.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
                                          const TargetLibraryInfo *TLI);

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
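///
/// For example, given a hypothetical GEP on an array @A indexed by the loop
/// induction variable %i:
///
///   %p = getelementptr [256 x i32], [256 x i32]* @A, i64 0, i64 %i
///
/// this returns %i, the operand identified by getGEPInductionOperand, since
/// all other operands are loop-invariant.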
Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// If a value has only one user that is a CastInst, return it.
Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then
/// extracted from the vector.
Value *findScalarElement(Value *V, unsigned EltNo);

/// If all non-negative \p Mask elements are the same value, return that value.
/// If all elements are negative (undefined) or \p Mask contains different
/// non-negative values, return -1.
int getSplatIndex(ArrayRef<int> Mask);

/// Get splat value if the input is a splat vector or return nullptr.
/// The value may be extracted from a splat constants vector or from
/// a sequence of instructions that broadcast a single value into a vector.
Value *getSplatValue(const Value *V);

/// Return true if each element of the vector value \p V is poisoned or equal
/// to every other non-poisoned element. If an index element is specified,
/// either every element of the vector is poisoned or the element at that
/// index is not poisoned and equal to every other non-poisoned element.
/// This may be more powerful than the related getSplatValue() because it is
/// not limited by finding a scalar source value to a splatted vector.
bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0);

/// Replace each shuffle mask index with the scaled sequential indices for an
/// equivalent mask of narrowed elements. Mask elements that are less than 0
/// (sentinel values) are repeated in the output mask.
///
/// Example with Scale = 4:
///   <4 x i32> <3, 2, 0, -1> -->
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1>
///
/// This is the reverse process of widening shuffle mask elements, but it
/// always succeeds because the indexes can always be multiplied (scaled up)
/// to map to narrower vector elements.
void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);

/// Try to transform a shuffle mask by replacing elements with the scaled index
/// for an equivalent mask of widened elements. If all mask elements that would
/// map to a wider element of the new mask are the same negative number
/// (sentinel value), that element of the new mask is the same value. If any
/// element in a given slice is negative and some other element in that slice
/// is not the same value, return false (partial matches with sentinel values
/// are not allowed).
///
/// Example with Scale = 4:
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1> -->
///   <4 x i32> <3, 2, 0, -1>
///
/// This is the reverse process of narrowing shuffle mask elements if it
/// succeeds. This transform is not always possible because indexes may not
/// divide evenly (scale down) to map to wider vector elements.
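///
/// A round-trip sketch using the two helpers above; the values mirror the
/// documented examples (a hypothetical mask, Scale = 4):
/// \code
///   SmallVector<int, 16> Narrow;
///   narrowShuffleMaskElts(4, {3, 2, 0, -1}, Narrow);
///   // Narrow: 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1
///   SmallVector<int, 4> Wide;
///   bool Ok = widenShuffleMaskElts(4, Narrow, Wide);
///   // Ok is true, and Wide is back to: 3, 2, 0, -1
/// \endcode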
bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                          SmallVectorImpl<int> &ScaledMask);

/// Splits and processes a shuffle mask depending on the number of input and
/// output registers. The function does two main things: 1) it splits the
/// source/destination vectors into real registers; 2) it analyzes the mask to
/// identify which real registers are permuted. The resulting per-register
/// masks are then processed using the provided action callbacks: if no input
/// register is defined, \p NoInputAction is used; if exactly one input
/// register is used, \p SingleInputAction is used; otherwise
/// \p ManyInputsAction is used to process two or more input registers and
/// their masks.
/// \param Mask Original shuffle mask.
/// \param NumOfSrcRegs Number of source registers.
/// \param NumOfDestRegs Number of destination registers.
/// \param NumOfUsedRegs Number of actually used destination registers.
void processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);

/// Compute a map of integer instructions to their minimum legal type
/// size.
///
/// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
/// type (e.g. i32) whenever arithmetic is performed on them.
///
/// For targets with native i8 or i16 operations, usually InstCombine can
/// shrink the arithmetic type down again. However, InstCombine refuses to
/// create illegal types, so for targets without i8 or i16 registers, the
/// lengthening and shrinking remains.
///
/// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
/// their scalar equivalents do not, so during vectorization it is important
/// to remove these extensions and truncations when deciding the profitability
/// of vectorization.
///
/// This function analyzes the given range of instructions and determines the
/// minimum type size each can be converted to. It attempts to remove or
/// minimize type size changes across each def-use chain, so for example in the
/// following code:
///
///   %1 = load i8, i8*
///   %2 = add i8 %1, 2
///   %3 = load i16, i16*
///   %4 = zext i8 %2 to i32
///   %5 = zext i16 %3 to i32
///   %6 = add i32 %4, %5
///   %7 = trunc i32 %6 to i16
///
/// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
/// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
///
/// If the optional TargetTransformInfo is provided, this function tries harder
/// to do less work by only looking at illegal types.
MapVector<Instruction *, uint64_t>
computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                         const TargetTransformInfo *TTI = nullptr);

/// Compute the union of two access-group lists.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);

/// Compute the access-group list of access groups that @p Inst1 and @p Inst2
/// are both in. If either instruction does not access memory at all, it is
/// considered to be in every list.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *intersectAccessGroups(const Instruction *Inst1,
                              const Instruction *Inst2);

/// Propagate metadata from the source instructions in \p VL to the new
/// instruction \p I.
///
/// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
/// MD_nontemporal, MD_access_group].
/// For K in Kinds, we get the MDNode for K from each of the
/// elements of VL, compute their "intersection" (i.e., the most generic
/// metadata value that covers all of the individual values), and set I's
/// metadata for K equal to the intersection value.
///
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);

/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for a \p Group with interleave factor 3
/// and \p VF 4 that has only its first member present is:
///
///   <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
                               const InterleaveGroup<Instruction> &Group);

/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
/// elements in a vector \p ReplicationFactor times. It can be used to
/// transform a mask of \p VF elements into a mask of
/// \p VF * \p ReplicationFactor elements used by a predicated
/// interleaved-group of loads/stores whose Interleaved-factor ==
/// \p ReplicationFactor.
///
/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
///
///   <0,0,0,1,1,1,2,2,2,3,3,3>
llvm::SmallVector<int, 16> createReplicatedMask(unsigned ReplicationFactor,
                                                unsigned VF);

/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
/// vectorization factor \p VF into a single wide vector. The mask is of the
/// form:
///
///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
///
/// For example, the mask for VF = 4 and NumVecs = 2 is:
///
///   <0, 4, 1, 5, 2, 6, 3, 7>.
llvm::SmallVector<int, 16> createInterleaveMask(unsigned VF, unsigned NumVecs);

/// Create a stride shuffle mask.
///
/// This function creates a shuffle mask whose elements begin at \p Start and
/// are incremented by \p Stride. The mask can be used to deinterleave an
/// interleaved vector into separate vectors of vectorization factor \p VF. The
/// mask is of the form:
///
///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
///
/// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
///
///   <0, 2, 4, 6>
llvm::SmallVector<int, 16> createStrideMask(unsigned Start, unsigned Stride,
                                            unsigned VF);

/// Create a sequential shuffle mask.
///
/// This function creates a shuffle mask whose elements are sequential and
/// begin at \p Start. The mask contains \p NumInts integers and is padded
/// with \p NumUndefs undef values. The mask is of the form:
///
///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
///
/// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
///
///   <0, 1, 2, 3, undef, undef, undef, undef>
llvm::SmallVector<int, 16>
createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs);

/// Given a shuffle mask for a binary shuffle, create the equivalent shuffle
/// mask assuming both operands are identical. This assumes that the unary
/// shuffle will use elements from operand 0 (operand 1 will be unused).
llvm::SmallVector<int, 16> createUnaryMask(ArrayRef<int> Mask,
                                           unsigned NumElts);

/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into
/// a single large vector. The number of vectors should be greater than one,
/// and their element types should be the same. The number of elements in the
/// vectors should also be the same; however, if the last vector has fewer
/// elements, it will be padded with undefs.
Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vecs);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be false or undef. That is, return true if all
/// lanes can be assumed inactive.
bool maskIsAllZeroOrUndef(Value *Mask);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be true or undef. That is, return true if all
/// lanes can be assumed active.
bool maskIsAllOneOrUndef(Value *Mask);

/// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
/// for each lane which may be active.
APInt possiblyDemandedEltsInMask(Value *Mask);

/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than the interleave factor, which is equal to the
/// absolute value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///   for (unsigned i = 0; i < 1024; i+=4) {
///     a = A[i];                           // Member of index 0
///     b = A[i+1];                         // Member of index 1
///     d = A[i+3];                         // Member of index 3
///     ...
///   }
///
/// An interleaved store group of factor 4:
///   for (unsigned i = 0; i < 1024; i+=4) {
///     ...
///     A[i]   = a;                         // Member of index 0
///     A[i+1] = b;                         // Member of index 1
///     A[i+2] = c;                         // Member of index 2
///     A[i+3] = d;                         // Member of index 3
///   }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
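///
/// A short usage sketch; I0 and I3 stand for hypothetical member load
/// instructions of a factor-4 group such as the one above, each with 4-byte
/// alignment:
/// \code
///   InterleaveGroup<Instruction> Group(I0, /*Stride=*/4, Align(4));
///   Group.insertMember(I3, /*Index=*/3, Align(4)); // member of index 3
///   assert(Group.getMember(1) == nullptr);         // index 1 is a gap
/// \endcode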
template <typename InstTy> class InterleaveGroup {
public:
  InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment)
      : Factor(Factor), Reverse(Reverse), Alignment(Alignment),
        InsertPos(nullptr) {}

  InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment)
      : Alignment(Alignment), InsertPos(Instr) {
    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  uint32_t getFactor() const { return Factor; }
  Align getAlign() const { return Alignment; }
  uint32_t getNumMembers() const { return Members.size(); }

  /// Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is related to the leader and it could be
  /// negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) {
    // Make sure the key fits in an int32_t.
    Optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
    if (!MaybeKey)
      return false;
    int32_t Key = *MaybeKey;

    // Skip if the key is used for either the tombstone or empty special
    // values.
    if (DenseMapInfo<int32_t>::getTombstoneKey() == Key ||
        DenseMapInfo<int32_t>::getEmptyKey() == Key)
      return false;

    // Skip if there is already a member with the same index.
    if (Members.find(Key) != Members.end())
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int32_t>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {
      // Make sure the largest index fits in an int32_t.
      Optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
      if (!MaybeLargestIndex)
        return false;

      // The largest index is always less than the interleave factor.
      if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Alignment = std::min(Alignment, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// Get the member with the given index \p Index.
  ///
  /// \returns nullptr if the group contains no such member.
  InstTy *getMember(uint32_t Index) const {
    int32_t Key = SmallestKey + Index;
    return Members.lookup(Key);
  }

  /// Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  uint32_t getIndex(const InstTy *Instr) const {
    for (auto I : Members) {
      if (I.second == Instr)
        return I.first - SmallestKey;
    }

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  InstTy *getInsertPos() const { return InsertPos; }
  void setInsertPos(InstTy *Inst) { InsertPos = Inst; }

  /// Add metadata (e.g. alias info) from the instructions in this group to \p
  /// NewInst.
  ///
  /// FIXME: this function currently does not add noalias metadata a la
  /// addNewMetadata. To do that we need to compute the intersection of the
  /// noalias info from all members.
  void addMetadata(InstTy *NewInst) const;

  /// Returns true if this Group requires a scalar iteration to handle gaps.
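  ///
  /// For example, in the hypothetical factor-4 load group sketched earlier
  /// (members at indices 0 and 3 only), the last member is present, so no
  /// scalar epilogue is needed; if index 3 were also missing, a wide load
  /// covering the trailing gap could read past the underlying object, so the
  /// last iteration must run scalar.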
  bool requiresScalarEpilogue() const {
    // If the last member of the Group exists, then a scalar epilogue is not
    // needed for this group.
    if (getMember(getFactor() - 1))
      return false;

    // We have a group with gaps. It therefore can't be a reversed access,
    // because such groups get invalidated (TODO).
    assert(!isReverse() && "Group should have been invalidated");

    // This is a group of loads, with gaps, and without a last member.
    return true;
  }

private:
  uint32_t Factor; // Interleave Factor.
  bool Reverse;
  Align Alignment;
  DenseMap<int32_t, InstTy *> Members;
  int32_t SmallestKey = 0;
  int32_t LargestKey = 0;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32     // Insert Position
  //      %add = add i32 %even // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32       // Def of %odd
  //      store i32 %odd       // Insert Position
  InstTy *InsertPos;
};

/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                        DominatorTree *DT, LoopInfo *LI,
                        const LoopAccessInfo *LAI)
      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}

  ~InterleavedAccessInfo() { invalidateGroups(); }

  /// Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  /// Consider also predicated loads/stores in the analysis if
  /// \p EnableMaskedInterleavedGroup is true.
  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);

  /// Invalidate groups, e.g., in case all blocks in the loop turn out to be
  /// predicated, contrary to the original assumption. Although we currently
  /// prevent group formation for predicated accesses, we may be able to relax
  /// this limitation in the future once we handle more complicated blocks.
  /// Returns true if any groups were invalidated.
  bool invalidateGroups() {
    if (InterleaveGroups.empty()) {
      assert(
          !RequiresScalarEpilogue &&
          "RequiresScalarEpilog should not be set without interleave groups");
      return false;
    }

    InterleaveGroupMap.clear();
    for (auto *Ptr : InterleaveGroups)
      delete Ptr;
    InterleaveGroups.clear();
    RequiresScalarEpilogue = false;
    return true;
  }

  /// Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if \p Instr does not belong to any group.
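  ///
  /// A minimal query sketch; IAI is a hypothetical InterleavedAccessInfo
  /// instance and I a member instruction found by analyzeInterleaving:
  /// \code
  ///   if (auto *Group = IAI.getInterleaveGroup(I))
  ///     errs() << "member index: " << Group->getIndex(I) << "\n";
  /// \endcode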
  InterleaveGroup<Instruction> *
  getInterleaveGroup(const Instruction *Instr) const {
    return InterleaveGroupMap.lookup(Instr);
  }

  iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
  getInterleaveGroups() {
    return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
  }

  /// Returns true if an interleaved group that may access memory
  /// out-of-bounds requires a scalar epilogue iteration for correctness.
  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
  /// happen when optimizing for size forbids a scalar epilogue, and the gap
  /// cannot be filtered by masking the load/store.
  void invalidateGroupsRequiringScalarEpilogue();

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  /// The interleaved access analysis can also add new predicates (for example
  /// by versioning strides of pointers).
  PredicatedScalarEvolution &PSE;

  Loop *TheLoop;
  DominatorTree *DT;
  LoopInfo *LI;
  const LoopAccessInfo *LAI;

  /// True if the loop may contain non-reversed interleaved groups with
  /// out-of-bounds accesses. We ensure we don't speculatively access memory
  /// out-of-bounds by executing at least one scalar epilogue iteration.
  bool RequiresScalarEpilogue = false;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;

  SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;

  /// Holds dependences among the memory accesses in the loop. It maps a
  /// source access to a set of dependent sink accesses.
  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

  /// The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor() = default;
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                     Align Alignment)
        : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {}

    // The access's stride. It is negative for a reverse access.
    int64_t Stride = 0;

    // The scalar expression of this access.
    const SCEV *Scev = nullptr;

    // The size of the memory object.
    uint64_t Size = 0;

    // The alignment of this access.
    Align Alignment;
  };

  /// A type for holding instructions and their stride descriptors.
  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

  /// Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Align.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup<Instruction> *
  createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] =
        new InterleaveGroup<Instruction>(Instr, Stride, Alignment);
    InterleaveGroups.insert(InterleaveGroupMap[Instr]);
    return InterleaveGroupMap[Instr];
  }

  /// Release the group and remove all the relationships.
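  ///
  /// After this call, isInterleaved(M) is false for every former member M of
  /// \p Group, and the group object itself has been deleted.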
  void releaseGroup(InterleaveGroup<Instruction> *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    InterleaveGroups.erase(Group);
    delete Group;
  }

  /// Collect all the accesses with a constant stride in program order.
  void collectConstStrideAccesses(
      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
      const ValueToValueMap &Strides);

  /// Returns true if \p Stride is allowed in an interleaved group.
  static bool isStrided(int Stride);

  /// Returns true if \p BB is a predicated block.
  bool isPredicated(BasicBlock *BB) const {
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  }

  /// Returns true if LoopAccessInfo can be used for dependence queries.
  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();
  }

  /// Returns true if memory accesses \p A and \p B can be reordered, if
  /// necessary, when constructing interleaved groups.
  ///
  /// \p A must precede \p B in program order. We return false if reordering
  /// is not necessary or is prevented because \p A and \p B may be dependent.
  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {
    // Code motion for interleaved accesses can potentially hoist strided
    // loads and sink strided stores. The code below checks the legality of
    // the following two conditions:
    //
    // 1. Potentially moving a strided load (B) before any store (A) that
    //    precedes B, or
    //
    // 2. Potentially moving a strided store (A) after any load or store (B)
    //    that A precedes.
    //
    // It's legal to reorder A and B if we know there isn't a dependence from
    // A to B. Note that this determination is conservative since some
    // dependences could potentially be reordered safely.

    // A is potentially the source of a dependence.
    auto *Src = A->first;
    auto SrcDes = A->second;

    // B is potentially the sink of a dependence.
    auto *Sink = B->first;
    auto SinkDes = B->second;

    // Code motion for interleaved accesses can't violate WAR dependences.
    // Thus, reordering is legal if the source isn't a write.
    if (!Src->mayWriteToMemory())
      return true;

    // At least one of the accesses must be strided.
    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
      return true;

    // If dependence information is not available from LoopAccessInfo,
    // conservatively assume the instructions can't be reordered.
    if (!areDependencesValid())
      return false;

    // If we know there is a dependence from source to sink, assume the
    // instructions can't be reordered. Otherwise, reordering is legal.
    return Dependences.find(Src) == Dependences.end() ||
           !Dependences.lookup(Src).count(Sink);
  }

  /// Collect the dependences from LoopAccessInfo.
  ///
  /// We process the dependences once during the interleaved access analysis
  /// to enable constant-time dependence queries.
  void collectDependences() {
    if (!areDependencesValid())
      return;
    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  }
};

} // namespace llvm

#endif