//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///   This is the most efficient implementation for the scan. DPP uses Whole
///   Wave Mode (WWM).
/// 2. Iterative -
///   An alternative implementation that iterates over all active lanes of the
///   wavefront using llvm.cttz and performs the scan using the readlane and
///   writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

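// Information about a candidate atomic operation: the instruction to rewrite,
// the binary operation it performs, the index of its data-value operand, and
// whether that value is divergent across the wavefront.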
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
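  // Each step applies DPP row_xmask:(1 << Idx), so every lane reads the lane
  // of its 16-lane row whose index differs from its own in bit Idx. Combining
  // with masks 1, 2, 4 and 8 is a butterfly reduction that leaves every lane
  // of the row holding the row's reduced value.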
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
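  // permlanex16 exchanges values between the two 16-lane halves of each
  // 32-lane group. Since every lane already holds its own row's reduction,
  // combining with the exchanged value gives the 32-lane reduction in every
  // lane.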
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

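  // Each step applies DPP row_shr:(1 << Idx), so every lane combines its value
  // with the value (1 << Idx) lanes below it within the same 16-lane row;
  // lanes with no source keep the identity. After shifts of 1, 2, 4 and 8,
  // every lane holds the inclusive scan of its row.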
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
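    // row_shr:1 only shifts within each 16-lane row, so the first lane of
    // every row now holds the identity. Patch lane 16 (and lanes 32 and 48 for
    // wave64) with the last value of the preceding row via readlane/writelane.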

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which uses WWM for the scan computations. This routine
// iterates over the active lanes to read, compute and update the value using
// the readlane and writelane intrinsics.
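// Returns the pair {exclusive scan result for each active lane, reduction of
// the values of all active lanes}.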
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {

  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

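  // A ballot over a uniform true gives a mask of the currently active lanes;
  // the loop below walks this mask one set bit (i.e. one lane) at a time.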
  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the scan result destination and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);

  // Read that lane's contribution (its value of V) to the atomic operation.
  auto *LaneValue =
      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear that lane's bit in the active mask so that llvm.cttz finds the next
  // active lane on the next iteration.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

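// Returns the identity value of the given atomic binary operation, i.e. the
// value X for which Op(V, X) == V for all V: zero for add, sub, or, xor and
// umax; all-ones for and and umin; the most negative value for smax; and the
// most positive value for smin.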
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

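// Multiply LHS by RHS, folding the multiplication away when LHS is the
// constant 1.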
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
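    // mbcnt_lo and mbcnt_hi each count set bits in a 32-bit half of the mask,
    // so for wave64 we split the 64-bit ballot into two i32 halves and chain
    // the two intrinsics to count the active lanes below us.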
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();
  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
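      // For example, if 16 lanes are active and each adds V, the wavefront
      // contributes a single atomic add of 16 * V.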
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
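      // Since V ^ V == 0, contributions from an even number of lanes cancel
      // out and only the parity of the active lane count matters.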
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane of the wavefront
  // to update the precomputed reduced value, and (for the iterative approach)
  // we have completed the codegen for the new control flow, i.e. the loop that
  // performs the reduction and scan using ComputeLoop and ComputeEnd.
  // For that new control flow we need to move the branch instruction (the
  // terminator created by SplitBlockAndInsertIfThen) from I's block to the
  // ComputeEnd block. We also need to record which block becomes the
  // predecessor of the exit block once the single lane has updated the final
  // reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    Instruction *Terminator = EntryBB->getTerminator();
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(EntryBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    DTU.applyUpdates(
        {{DominatorTree::Insert, EntryBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
         {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});

    Predecessor = ComputeEnd;
  } else {
    Predecessor = EntryBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // We need a final PHI to reconverge above the helper-lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}