//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy.  Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool>
    DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
                           cl::Hidden,
                           cl::desc("Disable unclustered high register pressure "
                                    "reduction scheduling stage."),
                           cl::init(false));
static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase occupancy only."),
    cl::init(10));

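// ScheduleMetrics comparisons are done in fixed point: bubble counts and
// profit ratios are scaled by ScaleFactor before integer division (see
// UnclusteredHighRPStage::shouldRevertScheduling), so a profit of ScaleFactor
// corresponds to a ratio of 1.0.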
const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  TargetOccupancy = MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // that it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
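    // For example (hypothetical numbers): with 256 addressable VGPRs, a
    // target occupancy of 4, and an allocation granule of 8, the budget is
    // alignDown(256 / 4, 8) = 64, so VGPRCriticalLimit becomes
    // min(64, VGPRExcessLimit).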
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs.  This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits.  We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit.  When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // carry the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
        PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
        PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
                  SGPRPressure, VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
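  // Stages run in the order they are pushed here; runSchedStages() gives each
  // stage a chance to skip itself entirely (initGCNSchedStage) or to skip
  // individual regions (initGCNRegion).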
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

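// Note the heuristic order below: stall cycles, resource balance, and latency
// are all tried before the clustering and RegCritical/RegMax pressure checks,
// which (relative to GenericScheduler::tryCandidate) trades register pressure
// for ILP.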
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg defs and copies toward their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling after the current block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
    SlotIndexes *Ind = LIS->getSlotIndexes();
    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
      OnlySucc = *MBB->succ_begin();
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

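// Build a cache from region-starting (non-debug) instructions to the register
// sets live before them; computeBlockPressure() seeds its RP tracker from
// this cache. Regions were recorded bottom-up, so they are walked in reverse
// here.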
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(createIGroupLPDAGMutation());

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy.
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions in between
  // the defs and the regions we sink the defs to. Cached pressure for regions
  // a def is sunk from will also be invalidated. This will need to be fixed
  // if there is another pass after this one.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n  From: " << *DAG.begin() << "    To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure:  "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    DAG.addMutation(createIGroupLPDAGMutation());
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just-scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
      PressureAfter.getAGPRNum() > MaxVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

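// Compute the cycle at which SU can issue: the maximum, over its register
// data dependences, of the producer's ready cycle plus the producer's
// latency. Results are memoized in ReadyCycles, keyed by NodeNum.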
unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction          "
            "             "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

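// Estimate schedule quality for a given instruction order: issue each SUnit
// at its data-dependence ready cycle and accumulate the stall ("bubble")
// cycles spent waiting. The derived metric is bubbles scaled by
// ScheduleMetrics::ScaleFactor over schedule length, so lower is better;
// e.g., 15 bubble cycles in a 100-cycle schedule gives a metric of 15.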
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  LLVM_DEBUG(
      dbgs()
      << "\n\t      *** In shouldRevertScheduling ***\n"
      << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
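  // Profit multiplies the occupancy gain by the latency-metric gain, both in
  // fixed point (scaled by ScheduleMetrics::ScaleFactor); ScheduleMetricBias
  // pads the old metric to weight the decision toward occupancy. E.g., with
  // hypothetical values WavesBefore = WavesAfter = 4, OldMetric = 50,
  // NewMetric = 70, and the default bias of 10:
  // Profit = (400 / 4) * 6000 / 70 / 100 = 85 < 100, so revert.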
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() &&
      !PressureAfter.less(ST, PressureBefore) && isRegionWithExcessRP()) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->operands())
      if (Op.isReg() && Op.isDef())
        Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling
  // region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If the Unsched.front() instruction is a debug instruction, this will
  // actually shrink the region since we moved all debug instructions to the
  // end of the block. Find the first instruction that is not a debug
  // instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with rematerializable reg as live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of register pressure and live-ins cache that will be updated
  // as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

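  // Map each rematerialized clone back to the def it replaces, so the whole
  // transformation can be rolled back below if occupancy does not improve.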
  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!DAG.RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check that the occupancy drop is caused by VGPR usage and not SGPR.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decrease RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If in the most optimistic scenario, we cannot improve occupancy, then do
    // not attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sank from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

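  // On top of the TII check, reject any virtual register uses: rematerializing
  // at the use point would require those uses to be live there as well, which
  // this stage does not verify (see the comment at the reMaterialize() call
  // site in sinkTriviallyRematInsts()).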
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we will only have to check if we are inserting NewMI
// in front of a scheduling region and do not need to check the ending, since
// we will only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1. After removing, the region will have
      // size 0, so set RegionBegin and RegionEnd to the past-the-end iterator
      // of the block.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return std::any_of(
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
        unsigned Opc = MI->getOpcode();
        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
      });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation());
  }

  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}