1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Machine Scheduler interface
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
16 
17 #include "SIInstrInfo.h"
18 #include "llvm/CodeGen/MachineBasicBlock.h"
19 #include "llvm/CodeGen/MachineScheduler.h"
20 #include "llvm/CodeGen/RegisterPressure.h"
21 #include "llvm/CodeGen/ScheduleDAG.h"
22 #include <cassert>
23 #include <cstdint>
24 #include <map>
25 #include <memory>
26 #include <set>
27 #include <vector>
28 
29 namespace llvm {
30 
31 enum SIScheduleCandReason {
32   NoCand,
33   RegUsage,
34   Latency,
35   Successor,
36   Depth,
37   NodeOrder
38 };
39 
40 struct SISchedulerCandidate {
41   // The reason for this candidate.
42   SIScheduleCandReason Reason = NoCand;
43 
44   // Set of reasons that apply to multiple candidates.
45   uint32_t RepeatReasonSet = 0;
46 
47   SISchedulerCandidate() = default;
48 
isRepeatSISchedulerCandidate49   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
setRepeatSISchedulerCandidate50   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
51 };
52 
53 class SIScheduleDAGMI;
54 class SIScheduleBlockCreator;
55 
56 enum SIScheduleBlockLinkKind {
57   NoData,
58   Data
59 };
60 
61 class SIScheduleBlock {
62   SIScheduleDAGMI *DAG;
63   SIScheduleBlockCreator *BC;
64 
65   std::vector<SUnit*> SUnits;
66   std::map<unsigned, unsigned> NodeNum2Index;
67   std::vector<SUnit*> TopReadySUs;
68   std::vector<SUnit*> ScheduledSUnits;
69 
70   /// The top of the unscheduled zone.
71   IntervalPressure TopPressure;
72   RegPressureTracker TopRPTracker;
73 
74   // Pressure: number of said class of registers needed to
75   // store the live virtual and real registers.
76   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
77   // Pressure of additional registers required inside the block.
78   std::vector<unsigned> InternalAdditionnalPressure;
79   // Pressure of input and output registers
80   std::vector<unsigned> LiveInPressure;
81   std::vector<unsigned> LiveOutPressure;
82   // Registers required by the block, and outputs.
83   // We do track only virtual registers.
84   // Note that some registers are not 32 bits,
85   // and thus the pressure is not equal
86   // to the number of live registers.
87   std::set<unsigned> LiveInRegs;
88   std::set<unsigned> LiveOutRegs;
89 
90   bool Scheduled = false;
91   bool HighLatencyBlock = false;
92 
93   std::vector<unsigned> HasLowLatencyNonWaitedParent;
94 
95   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
96   unsigned ID;
97 
98   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
99   // All blocks successors, and the kind of link
100   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
101   unsigned NumHighLatencySuccessors = 0;
102 
103 public:
SIScheduleBlock(SIScheduleDAGMI * DAG,SIScheduleBlockCreator * BC,unsigned ID)104   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
105                   unsigned ID):
106     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
107 
108   ~SIScheduleBlock() = default;
109 
getID()110   unsigned getID() const { return ID; }
111 
112   /// Functions for Block construction.
113   void addUnit(SUnit *SU);
114 
115   // When all SUs have been added.
116   void finalizeUnits();
117 
118   // Add block pred, which has instruction predecessor of SU.
119   void addPred(SIScheduleBlock *Pred);
120   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
121 
getPreds()122   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
123   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
getSuccs()124     getSuccs() const { return Succs; }
125 
126   unsigned Height;  // Maximum topdown path length to block without outputs
127   unsigned Depth;   // Maximum bottomup path length to block without inputs
128 
getNumHighLatencySuccessors()129   unsigned getNumHighLatencySuccessors() const {
130     return NumHighLatencySuccessors;
131   }
132 
isHighLatencyBlock()133   bool isHighLatencyBlock() { return HighLatencyBlock; }
134 
135   // This is approximative.
136   // Ideally should take into accounts some instructions (rcp, etc)
137   // are 4 times slower.
getCost()138   int getCost() { return SUnits.size(); }
139 
140   // The block Predecessors and Successors must be all registered
141   // before fastSchedule().
142   // Fast schedule with no particular requirement.
143   void fastSchedule();
144 
getScheduledUnits()145   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
146 
147   // Complete schedule that will try to minimize reg pressure and
148   // low latencies, and will fill liveins and liveouts.
149   // Needs all MIs to be grouped between BeginBlock and EndBlock.
150   // The MIs can be moved after the scheduling,
151   // it is just used to allow correct track of live registers.
152   void schedule(MachineBasicBlock::iterator BeginBlock,
153                 MachineBasicBlock::iterator EndBlock);
154 
isScheduled()155   bool isScheduled() { return Scheduled; }
156 
157   // Needs the block to be scheduled inside
158   // TODO: find a way to compute it.
getInternalAdditionnalRegUsage()159   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
160     return InternalAdditionnalPressure;
161   }
162 
getInRegs()163   std::set<unsigned> &getInRegs() { return LiveInRegs; }
getOutRegs()164   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
165 
166   void printDebug(bool Full);
167 
168 private:
169   struct SISchedCandidate : SISchedulerCandidate {
170     // The best SUnit candidate.
171     SUnit *SU = nullptr;
172 
173     unsigned SGPRUsage;
174     unsigned VGPRUsage;
175     bool IsLowLatency;
176     unsigned LowLatencyOffset;
177     bool HasLowLatencyNonWaitedParent;
178 
179     SISchedCandidate() = default;
180 
isValidSISchedCandidate181     bool isValid() const { return SU; }
182 
183     // Copy the status of another candidate without changing policy.
setBestSISchedCandidate184     void setBest(SISchedCandidate &Best) {
185       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
186       SU = Best.SU;
187       Reason = Best.Reason;
188       SGPRUsage = Best.SGPRUsage;
189       VGPRUsage = Best.VGPRUsage;
190       IsLowLatency = Best.IsLowLatency;
191       LowLatencyOffset = Best.LowLatencyOffset;
192       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
193     }
194   };
195 
196   void undoSchedule();
197 
198   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
199   void releaseSucc(SUnit *SU, SDep *SuccEdge);
200   // InOrOutBlock: restrict to links pointing inside the block (true),
201   // or restrict to links pointing outside the block (false).
202   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
203 
204   void nodeScheduled(SUnit *SU);
205   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
206   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
207   SUnit* pickNode();
208   void traceCandidate(const SISchedCandidate &Cand);
209   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
210                        MachineBasicBlock::iterator EndBlock);
211 };
212 
213 struct SIScheduleBlocks {
214   std::vector<SIScheduleBlock*> Blocks;
215   std::vector<int> TopDownIndex2Block;
216   std::vector<int> TopDownBlock2Index;
217 };
218 
219 enum SISchedulerBlockCreatorVariant {
220   LatenciesAlone,
221   LatenciesGrouped,
222   LatenciesAlonePlusConsecutive
223 };
224 
225 class SIScheduleBlockCreator {
226   SIScheduleDAGMI *DAG;
227   // unique_ptr handles freeing memory for us.
228   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
229   std::map<SISchedulerBlockCreatorVariant,
230            SIScheduleBlocks> Blocks;
231   std::vector<SIScheduleBlock*> CurrentBlocks;
232   std::vector<int> Node2CurrentBlock;
233 
234   // Topological sort
235   // Maps topological index to the node number.
236   std::vector<int> TopDownIndex2Block;
237   std::vector<int> TopDownBlock2Index;
238   std::vector<int> BottomUpIndex2Block;
239 
240   // 0 -> Color not given.
241   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
242   // Above -> Other groups.
243   int NextReservedID;
244   int NextNonReservedID;
245   std::vector<int> CurrentColoring;
246   std::vector<int> CurrentTopDownReservedDependencyColoring;
247   std::vector<int> CurrentBottomUpReservedDependencyColoring;
248 
249 public:
250   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
251 
252   SIScheduleBlocks
253   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
254 
255   bool isSUInBlock(SUnit *SU, unsigned ID);
256 
257 private:
258   // Give a Reserved color to every high latency.
259   void colorHighLatenciesAlone();
260 
261   // Create groups of high latencies with a Reserved color.
262   void colorHighLatenciesGroups();
263 
264   // Compute coloring for topdown and bottom traversals with
265   // different colors depending on dependencies on Reserved colors.
266   void colorComputeReservedDependencies();
267 
268   // Give color to all non-colored SUs according to Reserved groups dependencies.
269   void colorAccordingToReservedDependencies();
270 
271   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
272   // The new colors are computed according to the dependencies on the other blocks
273   // formed with colorAccordingToReservedDependencies.
274   void colorEndsAccordingToDependencies();
275 
276   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
277   void colorForceConsecutiveOrderInGroup();
278 
279   // Merge Constant loads that have all their users into another group to the group.
280   // (TODO: else if all their users depend on the same group, put them there)
281   void colorMergeConstantLoadsNextGroup();
282 
283   // Merge SUs that have all their users into another group to the group
284   void colorMergeIfPossibleNextGroup();
285 
286   // Merge SUs that have all their users into another group to the group,
287   // but only for Reserved groups.
288   void colorMergeIfPossibleNextGroupOnlyForReserved();
289 
290   // Merge SUs that have all their users into another group to the group,
291   // but only if the group is no more than a few SUs.
292   void colorMergeIfPossibleSmallGroupsToNextGroup();
293 
294   // Divides Blocks with important size.
295   // Idea of implementation: attribute new colors depending on topdown and
296   // bottom up links to other blocks.
297   void cutHugeBlocks();
298 
299   // Put in one group all instructions with no users in this scheduling region
300   // (we'd want these groups be at the end).
301   void regroupNoUserInstructions();
302 
303   // Give Reserved color to export instructions
304   void colorExports();
305 
306   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
307 
308   void topologicalSort();
309 
310   void scheduleInsideBlocks();
311 
312   void fillStats();
313 };
314 
315 enum SISchedulerBlockSchedulerVariant {
316   BlockLatencyRegUsage,
317   BlockRegUsageLatency,
318   BlockRegUsage
319 };
320 
321 class SIScheduleBlockScheduler {
322   SIScheduleDAGMI *DAG;
323   SISchedulerBlockSchedulerVariant Variant;
324   std::vector<SIScheduleBlock*> Blocks;
325 
326   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
327   std::set<unsigned> LiveRegs;
328   // Num of schedulable unscheduled blocks reading the register.
329   std::map<unsigned, unsigned> LiveRegsConsumers;
330 
331   std::vector<unsigned> LastPosHighLatencyParentScheduled;
332   int LastPosWaitedHighLatency;
333 
334   std::vector<SIScheduleBlock*> BlocksScheduled;
335   unsigned NumBlockScheduled;
336   std::vector<SIScheduleBlock*> ReadyBlocks;
337 
338   unsigned VregCurrentUsage;
339   unsigned SregCurrentUsage;
340 
341   // Currently is only approximation.
342   unsigned maxVregUsage;
343   unsigned maxSregUsage;
344 
345   std::vector<unsigned> BlockNumPredsLeft;
346   std::vector<unsigned> BlockNumSuccsLeft;
347 
348 public:
349   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
350                            SISchedulerBlockSchedulerVariant Variant,
351                            SIScheduleBlocks BlocksStruct);
352   ~SIScheduleBlockScheduler() = default;
353 
getBlocks()354   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
355 
getVGPRUsage()356   unsigned getVGPRUsage() { return maxVregUsage; }
getSGPRUsage()357   unsigned getSGPRUsage() { return maxSregUsage; }
358 
359 private:
360   struct SIBlockSchedCandidate : SISchedulerCandidate {
361     // The best Block candidate.
362     SIScheduleBlock *Block = nullptr;
363 
364     bool IsHighLatency;
365     int VGPRUsageDiff;
366     unsigned NumSuccessors;
367     unsigned NumHighLatencySuccessors;
368     unsigned LastPosHighLatParentScheduled;
369     unsigned Height;
370 
371     SIBlockSchedCandidate() = default;
372 
isValidSIBlockSchedCandidate373     bool isValid() const { return Block; }
374 
375     // Copy the status of another candidate without changing policy.
setBestSIBlockSchedCandidate376     void setBest(SIBlockSchedCandidate &Best) {
377       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
378       Block = Best.Block;
379       Reason = Best.Reason;
380       IsHighLatency = Best.IsHighLatency;
381       VGPRUsageDiff = Best.VGPRUsageDiff;
382       NumSuccessors = Best.NumSuccessors;
383       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
384       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
385       Height = Best.Height;
386     }
387   };
388 
389   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
390                            SIBlockSchedCandidate &TryCand);
391   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
392                             SIBlockSchedCandidate &TryCand);
393   SIScheduleBlock *pickBlock();
394 
395   void addLiveRegs(std::set<unsigned> &Regs);
396   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
397   void releaseBlockSuccs(SIScheduleBlock *Parent);
398   void blockScheduled(SIScheduleBlock *Block);
399 
400   // Check register pressure change
401   // by scheduling a block with these LiveIn and LiveOut.
402   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
403                                        std::set<unsigned> &OutRegs);
404 
405   void schedule();
406 };
407 
408 struct SIScheduleBlockResult {
409   std::vector<unsigned> SUs;
410   unsigned MaxSGPRUsage;
411   unsigned MaxVGPRUsage;
412 };
413 
414 class SIScheduler {
415   SIScheduleDAGMI *DAG;
416   SIScheduleBlockCreator BlockCreator;
417 
418 public:
SIScheduler(SIScheduleDAGMI * DAG)419   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
420 
421   ~SIScheduler() = default;
422 
423   struct SIScheduleBlockResult
424   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
425                   SISchedulerBlockSchedulerVariant ScheduleVariant);
426 };
427 
428 class SIScheduleDAGMI final : public ScheduleDAGMILive {
429   const SIInstrInfo *SITII;
430   const SIRegisterInfo *SITRI;
431 
432   std::vector<SUnit> SUnitsLinksBackup;
433 
434   // For moveLowLatencies. After all Scheduling variants are tested.
435   std::vector<unsigned> ScheduledSUnits;
436   std::vector<unsigned> ScheduledSUnitsInv;
437 
438   unsigned VGPRSetID;
439   unsigned SGPRSetID;
440 
441 public:
442   SIScheduleDAGMI(MachineSchedContext *C);
443 
444   ~SIScheduleDAGMI() override;
445 
446   // Entry point for the schedule.
447   void schedule() override;
448 
449   // To init Block's RPTracker.
initRPTracker(RegPressureTracker & RPTracker)450   void initRPTracker(RegPressureTracker &RPTracker) {
451     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
452   }
453 
getBB()454   MachineBasicBlock *getBB() { return BB; }
getCurrentTop()455   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
getCurrentBottom()456   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
getLIS()457   LiveIntervals *getLIS() { return LIS; }
getMRI()458   MachineRegisterInfo *getMRI() { return &MRI; }
getTRI()459   const TargetRegisterInfo *getTRI() { return TRI; }
GetTopo()460   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
getEntrySU()461   SUnit& getEntrySU() { return EntrySU; }
getExitSU()462   SUnit& getExitSU() { return ExitSU; }
463 
464   void restoreSULinksLeft();
465 
466   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
467                                                      _Iterator End,
468                                                      unsigned &VgprUsage,
469                                                      unsigned &SgprUsage);
470 
getInRegs()471   std::set<unsigned> getInRegs() {
472     std::set<unsigned> InRegs;
473     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
474       InRegs.insert(RegMaskPair.RegUnit);
475     }
476     return InRegs;
477   }
478 
getOutRegs()479   std::set<unsigned> getOutRegs() {
480     std::set<unsigned> OutRegs;
481     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
482       OutRegs.insert(RegMaskPair.RegUnit);
483     }
484     return OutRegs;
485   };
486 
getVGPRSetID()487   unsigned getVGPRSetID() const { return VGPRSetID; }
getSGPRSetID()488   unsigned getSGPRSetID() const { return SGPRSetID; }
489 
490 private:
491   void topologicalSort();
492   // After scheduling is done, improve low latency placements.
493   void moveLowLatencies();
494 
495 public:
496   // Some stats for scheduling inside blocks.
497   std::vector<unsigned> IsLowLatencySU;
498   std::vector<unsigned> LowLatencyOffset;
499   std::vector<unsigned> IsHighLatencySU;
500   // Topological sort
501   // Maps topological index to the node number.
502   std::vector<int> TopDownIndex2SU;
503   std::vector<int> BottomUpIndex2SU;
504 };
505 
506 } // end namespace llvm
507 
508 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
509