//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
// and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
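// When an i1 value defined under divergent control flow has to be combined
// with a previously defined lane mask (see buildMergeLaneMasks below), the
// merge is conceptually
//
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
//
// i.e. lanes that are active at the merge point take the current value and
// inactive lanes keep the previous one; known-constant inputs allow cheaper
// instruction sequences.
//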
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
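/// Illustrative sketch (hypothetical CFG, not taken from the source): in an
/// if-then region with edges A -> B, A -> C, and B -> C, where A ends in a
/// divergent SI_IF and C contains the phi, the wave can visit B after leaving
/// A and before entering C. B therefore has a predecessor in the reachable
/// set, so B's incoming value must be merged with the lane mask coming from
/// A, whereas A has none and its incoming value can be taken as-is.
///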
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred) {
        ReachableMap[MBB] = true;
      } else {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
/// relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      unsigned SrcReg = MI.getOperand(1).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (isLaneMaskReg(DstReg) ||
          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> DeadPhis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB.phis()) {
      unsigned DstReg = MI.getOperand(0).getReg();
      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);

      // Collect incoming values.
      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
        assert(i + 1 < MI.getNumOperands());
        unsigned IncomingReg = MI.getOperand(i).getReg();
        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
          IncomingReg = IncomingDef->getOperand(1).getReg();
          assert(isLaneMaskReg(IncomingReg));
          assert(!IncomingDef->getOperand(1).getSubReg());
        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          continue;
        } else {
          assert(IncomingDef->isPHI());
        }

        IncomingBlocks.push_back(IncomingMBB);
        IncomingRegs.push_back(IncomingReg);
      }

      // Phis in a loop that are observed outside the loop receive a simple but
      // conservatively correct treatment.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

      SSAUpdater.Initialize(DstReg);

      if (FoundLoopLevel) {
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                       IncomingUpdated.back());
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      } else {
        // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
        PIA.analyze(MBB, IncomingBlocks);

        for (MachineBasicBlock *MBB : PIA.predecessors())
          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          if (PIA.isSource(IMBB)) {
            IncomingUpdated.push_back(0);
            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
          } else {
            IncomingUpdated.push_back(createLaneMaskReg(*MF));
            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
          }
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          if (!IncomingUpdated[i])
            continue;

          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      }

      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
      if (NewReg != DstReg) {
        MRI->replaceRegWith(NewReg, DstReg);

        // Ensure that DstReg has a single def and mark the old PHI node for
        // deletion.
        MI.getOperand(0).setReg(NewReg);
        DeadPhis.push_back(&MI);
      }

      IncomingBlocks.clear();
      IncomingRegs.clear();
      IncomingUpdated.clear();
    }

    for (MachineInstr *MI : DeadPhis)
      MI->eraseFromParent();
    DeadPhis.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      unsigned SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          !isLaneMaskReg(SrcReg)) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}
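
/// Merge \p PrevReg and \p CurReg into \p DstReg. Conceptually this computes
///
///   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
///
/// so that active lanes take the current value and inactive lanes keep the
/// previous one. When either input is a constant lane mask (all zeros or all
/// ones), cheaper instruction sequences are emitted.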
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
          .addReg(AMDGPU::EXEC)
          .addImm(-1);
    }
    return;
  }
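
  // General case: mask the previous value to the inactive lanes and the
  // current value to the active lanes, then OR the two halves together.
  // Known-constant inputs allow some of these steps to be folded away.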
  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(AMDGPU::EXEC);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
          .addReg(CurReg)
          .addReg(AMDGPU::EXEC);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
        .addReg(CurMaskedReg)
        .addReg(AMDGPU::EXEC);
  } else {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
  }
}