//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
// and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

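  // Lane-mask registers that end up as the condition operand of a V_CNDMASK;
  // they are constrained to SReg_64_XEXEC once lowering is done.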
  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

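  /// Return true if \p Reg is a scalar register whose width equals the
  /// wavefront size, i.e. it can hold a full lane mask.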
  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that examines how the incoming values of a phi relate to each
/// other in the control flow graph, to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

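    // Propagate reachability from the seeded successors, stopping at the def
    // block (which is already in ReachableMap).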
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

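    // A reachable block without a reachable predecessor is a source of the
    // induced subgraph; for the others, remember the unreachable predecessors
    // so that the caller can seed the SSAUpdater with undef values there.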
    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 contains all blocks reachable up to and including the def block's
  // IPDOM but not beyond it, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

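  /// Explore the CFG one level further: advance VisitedPostDom by one step in
  /// the post-dominator tree and traverse the newly covered blocks, updating
  /// the nearest common dominator of the level and recording the level at
  /// which a backward edge to the def block is first encountered.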
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

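// Create a new 64-bit scalar virtual register to hold a lane mask.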
static unsigned createLaneMaskReg(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

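/// Lower COPYs whose source is a vreg_1 register and whose destination is
/// neither a lane mask nor another vreg_1 (typically a 32-bit VGPR feeding
/// inline assembly): expand the mask into a per-lane 0/-1 value with
/// V_CNDMASK_B32.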
void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      unsigned SrcReg = MI.getOperand(1).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (isLaneMaskReg(DstReg) ||
          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

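/// Lower PHI nodes whose result is a vreg_1 register: the result becomes a
/// 64-bit scalar register, incoming values are merged into lane masks at the
/// end of their incoming blocks where necessary, and MachineSSAUpdater
/// rewrites the intermediate data flow.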
void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> DeadPhis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB.phis()) {
      unsigned DstReg = MI.getOperand(0).getReg();
      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);

      // Collect incoming values.
      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
        assert(i + 1 < MI.getNumOperands());
        unsigned IncomingReg = MI.getOperand(i).getReg();
        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
          IncomingReg = IncomingDef->getOperand(1).getReg();
          assert(isLaneMaskReg(IncomingReg));
          assert(!IncomingDef->getOperand(1).getSubReg());
        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          continue;
        } else {
          assert(IncomingDef->isPHI());
        }

        IncomingBlocks.push_back(IncomingMBB);
        IncomingRegs.push_back(IncomingReg);
      }

      // Phis in a loop that are observed outside the loop receive a simple but
      // conservatively correct treatment.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

      SSAUpdater.Initialize(DstReg);

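      // If the phi result is observed outside an enclosing loop, every
      // incoming value is merged with the lane mask from previous iterations
      // at the end of its incoming block.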
      if (FoundLoopLevel) {
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                       IncomingUpdated.back());
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      } else {
        // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
        PIA.analyze(MBB, IncomingBlocks);

        for (MachineBasicBlock *MBB : PIA.predecessors())
          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          if (PIA.isSource(IMBB)) {
            IncomingUpdated.push_back(0);
            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
          } else {
            IncomingUpdated.push_back(createLaneMaskReg(*MF));
            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
          }
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          if (!IncomingUpdated[i])
            continue;

          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      }

      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
      if (NewReg != DstReg) {
        MRI->replaceRegWith(NewReg, DstReg);

        // Ensure that DstReg has a single def and mark the old PHI node for
        // deletion.
        MI.getOperand(0).setReg(NewReg);
        DeadPhis.push_back(&MI);
      }

      IncomingBlocks.clear();
      IncomingRegs.clear();
      IncomingUpdated.clear();
    }

    for (MachineInstr *MI : DeadPhis)
      MI->eraseFromParent();
    DeadPhis.clear();
  }
}

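/// Lower COPYs and IMPLICIT_DEFs that define a vreg_1 register. The register
/// class is changed to SReg_64; defs inside a loop whose result is observed
/// outside the loop are additionally merged with the lane mask of previous
/// iterations.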
void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      unsigned SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

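      // If the source is a 32-bit value (e.g. produced by inline assembly),
      // convert it to a lane mask by comparing each lane against zero.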
      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          !isLaneMaskReg(SrcReg)) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

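/// Return true if \p Reg is defined (looking through lane-mask copies) by an
/// S_MOV_B64 of an all-zeros or all-ones immediate, and set \p Val to the
/// corresponding boolean value.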
bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

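/// Determine whether \p MI defines and/or uses SCC and report the result in
/// \p Def and \p Use.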
static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

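  // A terminator reads SCC: back up to the instruction that defines SCC so
  // that the lane-mask operations are inserted before it and cannot clobber
  // the value the terminator depends on.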
  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

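/// Build the lane mask merge \p DstReg = (\p PrevReg & ~EXEC) | (\p CurReg & EXEC):
/// lanes that are active in EXEC take their bit from \p CurReg, inactive lanes
/// keep their bit from \p PrevReg. Constant all-zeros / all-ones inputs are
/// folded to emit fewer instructions.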
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
          .addReg(AMDGPU::EXEC)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(AMDGPU::EXEC);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
          .addReg(CurReg)
          .addReg(AMDGPU::EXEC);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
        .addReg(CurMaskedReg)
        .addReg(AMDGPU::EXEC);
  } else {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
  }
}