//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
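
  // A minimal sketch of the rewrite described above, for a uniform i16 add
  // (illustrative IR; the value names are made up for this example):
  //
  //   %r = add i16 %a, %b
  // ==>
  //   %ext0 = zext i16 %a to i32
  //   %ext1 = zext i16 %b to i32
  //   %res  = add nuw nsw i32 %ext0, %ext1
  //   %r    = trunc i32 %res to i16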

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// SelectionDAG has an issue where an 'and' asserting that bits are known to
  /// be zero is not always visible when matching mul24, so the rewrite is done
  /// here on IR instead.
  bool replaceMulWithMul24(BinaryOperator &I) const;
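
  // A minimal sketch of the mul24 rewrite, for a divergent i32 multiply whose
  // operands are known to fit in 24 bits (illustrative IR; names made up):
  //
  //   %r = mul i32 %a, %b
  // ==>
  //   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)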

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and truncate the result afterwards, so that a scalar load can be
  /// selected instead of a vector load.
  ///
  /// \returns True if the load can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
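
// Rationale for the nsw/nuw choices above (a sketch, not from the original
// comments): operands start as at most 16 bits and are zero-extended to 32,
// so:
//  - shl/add/sub stay within signed i32 range, so nsw is always safe;
//  - shl/add/mul stay within unsigned i32 range ((2^16-1)^2 < 2^32), so nuw
//    is always safe;
//  - sub may go below zero, so it only keeps nuw if the original op had it;
//  - mul only gets nsw if the original had nuw, i.e. the product fits in the
//    narrow type and therefore in i32's positive range.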

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(I))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(I))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
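
// A sketch of the bitreverse promotion implemented below, for an i16 operand
// (illustrative IR; names made up):
//
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// ==>
//   %ext   = zext i16 %x to i32
//   %rev   = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shift = lshr i32 %rev, 16    ; 32 - 16 = 16
//   %r     = trunc i32 %shift to i16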
bitreverse intrinsic"); 384 assert(needsPromotionToI32(I.getType()) && 385 "I does not need promotion to i32"); 386 387 IRBuilder<> Builder(&I); 388 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 389 390 Type *I32Ty = getI32Ty(Builder, I.getType()); 391 Function *I32 = 392 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); 393 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); 394 Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); 395 Value *LShrOp = 396 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); 397 Value *TruncRes = 398 Builder.CreateTrunc(LShrOp, I.getType()); 399 400 I.replaceAllUsesWith(TruncRes); 401 I.eraseFromParent(); 402 403 return true; 404 } 405 406 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op, 407 unsigned ScalarSize) const { 408 KnownBits Known = computeKnownBits(Op, *DL, 0, AC); 409 return ScalarSize - Known.countMinLeadingZeros(); 410 } 411 412 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op, 413 unsigned ScalarSize) const { 414 // In order for this to be a signed 24-bit value, bit 23, must 415 // be a sign bit. 416 return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC); 417 } 418 419 bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const { 420 return ScalarSize >= 24 && // Types less than 24-bit should be treated 421 // as unsigned 24-bit values. 422 numBitsSigned(V, ScalarSize) < 24; 423 } 424 425 bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const { 426 return numBitsUnsigned(V, ScalarSize) <= 24; 427 } 428 429 static void extractValues(IRBuilder<> &Builder, 430 SmallVectorImpl<Value *> &Values, Value *V) { 431 VectorType *VT = dyn_cast<VectorType>(V->getType()); 432 if (!VT) { 433 Values.push_back(V); 434 return; 435 } 436 437 for (int I = 0, E = VT->getNumElements(); I != E; ++I) 438 Values.push_back(Builder.CreateExtractElement(V, I)); 439 } 440 441 static Value *insertValues(IRBuilder<> &Builder, 442 Type *Ty, 443 SmallVectorImpl<Value *> &Values) { 444 if (Values.size() == 1) 445 return Values[0]; 446 447 Value *NewVal = UndefValue::get(Ty); 448 for (int I = 0, E = Values.size(); I != E; ++I) 449 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I); 450 451 return NewVal; 452 } 453 454 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { 455 if (I.getOpcode() != Instruction::Mul) 456 return false; 457 458 Type *Ty = I.getType(); 459 unsigned Size = Ty->getScalarSizeInBits(); 460 if (Size <= 16 && ST->has16BitInsts()) 461 return false; 462 463 // Prefer scalar if this could be s_mul_i32 464 if (DA->isUniform(&I)) 465 return false; 466 467 Value *LHS = I.getOperand(0); 468 Value *RHS = I.getOperand(1); 469 IRBuilder<> Builder(&I); 470 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 471 472 Intrinsic::ID IntrID = Intrinsic::not_intrinsic; 473 474 // TODO: Should this try to match mulhi24? 
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
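    // Scalarize the vector fdiv: each element either stays a plain fdiv (when
    // shouldKeepFDivF32 says precision/denormal handling requires it) or is
    // replaced with a call to llvm.amdgcn.fdiv.fast.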
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }
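
  // In outline (a sketch of the scheme below, not from the original comments):
  // convert both operands to f32, form q = trunc(fa * (1/fb)), and compute the
  // residual fr = fa - q*fb with a fused multiply-add. If |fr| >= |fb|, the
  // truncated quotient is one short, so the correction jq (+1 unsigned, +/-1
  // signed depending on the sign of the result) is added back in.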

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
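  // In effect (a sketch, not from the original comments): RCP is a 0.32
  // fixed-point approximation of 1/Den, computed from the f32 reciprocal and
  // scaled by 2^32. The mulhu steps below estimate the error e of that
  // approximation and nudge RCP by e in the appropriate direction before it
  // is used to form the quotient estimate.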
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
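
  // The quotient estimate above is accurate to within one unit, so the final
  // result only ever needs a single +/-1 (or +/-Den for rem) adjustment,
  // chosen from the two comparisons just computed. (Interpretive note, not
  // from the original comments.)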
  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }
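
    // Truncate the widened value back to the original width, then bitcast so
    // non-integer load types (e.g. half) get their original type back.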
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}