//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "igrouplp"

namespace {

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));

// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
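
// For example, a group meant to hold both LDS reads and LDS writes can use
// the combined mask
//   SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE // (1u << 8) | (1u << 9)
// while the broader DS bit alone admits any LDS instruction. These are the
// same mask values carried by the SCHED_BARRIER and SCHED_GROUP_BARRIER
// intrinsic operands.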

class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  }

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};
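
// Concrete rules subclass InstructionRule and override apply(); see, e.g.,
// EnablesInitialMFMA and IsPermForDSW in MFMASmallGemmSingleWaveOpt below. A
// rule that needs to scan the DAG once and reuse the result across
// invocations requests the Cache via the NeedsCache constructor argument.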

typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from
  // SCHED_BARRIER and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the
  // same SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    if (Rules.empty())
      return true;
    for (size_t I = 0; I < Rules.size(); I++) {
      auto TheRule = Rules[I].get();
      if (!TheRule->apply(SU, Collection, SyncPipe)) {
        return false;
      }
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
    SGID = NumSchedGroups++;
  }
};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);

  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}
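
// Note: SUnit::removePred mutates both the predecessor list being iterated
// and the corresponding successor lists, so the range-based for loops above
// can invalidate their ranges mid-pass; the enclosing while loops re-scan
// until the lists are fully drained.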

typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;

// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small problems, or medium-sized problems where an exact
// solution is highly desired.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = 1;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
  /// added in \p AddedEdges
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. A heuristic
  // and a command-line option determine whether to use the exact or the
  // greedy algorithm.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};

void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}

void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for good parsing order in
      // the greedy algorithm
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}

template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    auto &GroupA = *I;
    for (auto J = std::next(I); J != E; ++J) {
      auto &GroupB = *J;
      GroupA.link(GroupB);
    }
  }
}
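
// linkSchedGroups establishes a total order over the range: every group is
// linked before each group that follows it, so N groups produce O(N^2)
// group-to-group link calls. Iterating in reverse (the IsBottomUp case in
// makePipeline) makes the last-created SchedGroup the earliest in the final
// schedule.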

void PipelineSolver::makePipeline() {
  // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
  for (auto &SyncPipeline : BestPipeline) {
    LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
    for (auto &SG : SyncPipeline) {
      LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
                        << " has: \n");
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
        LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
      }
      // Command line requested IGroupLP doesn't have SGBarr
      if (!SGBarr)
        continue;
      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);
    }
  }

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
  }
}

template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    if (I->getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  return AddedCost;
}

int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {

  // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
  // instructions that are the ultimate successors in the resultant mutation.
  // Therefore, in such a configuration, the SchedGroups occurring before the
  // candidate SGID are successors of the candidate SchedGroup, thus the
  // current SU should be linked as a predecessor to SUs in those SchedGroups.
  // The opposite is true if !IsBottomUp. IsBottomUp occurs in the case of
  // multiple SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups
  // using IsBottomUp (in reverse).
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                                SyncPipeline.rend())
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
                                SyncPipeline.end());
}

void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we have added when testing
  // the fit.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}

void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to next non-trivial pipeline
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}

void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored
    // the entire tree
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to previous non-trivial pipeline
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}

bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}
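
// A true return from checkOptimal terminates the exact search: either a
// zero-cost (perfect) pipeline fit has been recorded, or the branch budget
// set by -amdgpu-igrouplp-exact-solver-max-branches is exhausted.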

template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    if (UseCostHeur) {
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  if (UseCostHeur) {
    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;
              });
  }

  assert(ReadyList.size() == CurrSU.second.size());
}
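
// With the cost heuristic enabled the ready list comes back sorted by
// tentative cost, so solveExact visits the cheapest candidate group first.
// Combined with solveExact's "CurrCost + I->second > BestCost" check, this
// allows all remaining candidates to be pruned as soon as one is provably no
// better than the best known solution.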

bool PipelineSolver::solveExact() {
  if (checkOptimal())
    return true;

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
    return false;

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // SchedGroup -> Cost pairs
  SmallVector<std::pair<int, int>, 4> ReadyList;
  // Prioritize the candidate sched groups in terms of lowest cost first
  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());

  auto I = ReadyList.begin();
  auto E = ReadyList.end();
  for (; I != E; ++I) {
    // If we are trying SGs in least cost order, and the current SG is cost
    // infeasible, then all subsequent SGs will also be cost infeasible, so we
    // can prune.
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
      return false;

    int CandSGID = I->first;
    int AddedCost = 0;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    SchedGroup *Match = nullptr;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }
    assert(Match);

    if (Match->isFull())
      continue;

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
      continue;

    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                      << (int)Match->getMask() << " and ID " << CandSGID
                      << "\n");
    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;
    advancePosition();
    ++BranchesExplored;
    bool FinishedExploring = false;
    // If the Cost after adding edges is greater than a known solution,
    // backtrack
    if (CurrCost < BestCost || BestCost == -1) {
      if (solveExact()) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)
          return true;
      }
    }

    retreatPosition();
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)
      return true;
  }

  // Try the pipeline where the current instruction is omitted
  // Potentially if we omit a problematic instruction from the pipeline,
  // all the other instructions can nicely fit.
  CurrCost += MissPenalty;
  advancePosition();

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
    if (solveExact()) {
      FinishedExploring = BestCost != 0;
      if (!FinishedExploring)
        return true;
    }
  }

  retreatPosition();
  CurrCost -= MissPenalty;
  return FinishedExploring;
}

template <typename T>
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;
  int TempCost;
  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // Since we have added the potential SchedGroups from bottom up, but
  // traversed the DAG from top down, parse over the groups from last to
  // first. If we fail to do this for the greedy algorithm, the solution will
  // likely not be good in more complex cases.
  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
      continue;
    }
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
      continue;
    }
    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestGroup = Match;
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;
    }
    removeEdges(AddedEdges);
    if (BestNodeCost == 0)
      break;
  }

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;
  } else
    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
}

bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    IsBottomUp
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
  return false;
}

unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}

void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  LLVM_DEBUG(DAG->dump());
  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the Greedy Algorithm by default
    LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
    solveGreedy();
  }

  makePipeline();
  LLVM_DEBUG(dbgs() << "After applying mutation\n");
  LLVM_DEBUG(DAG->dump());
}

enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
};
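
// Assumption (the decoding is not shown in this part of the file): these
// enumerator values are matched against the IGLP_OPT instruction's immediate
// operand in initIGLPOpt below, so they must stay in sync with the values the
// frontend builtin emits.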

// Implement an IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;

  const SIInstrInfo *TII;

public:
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
  virtual void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) = 0;

  // Returns true if this strategy should be applied to a ScheduleDAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;

  bool IsBottomUp = 1;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};

class MFMASmallGemmOpt final : public IGLPStrategy {
public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = 1;
  }
};

void MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    bool IsReentry) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }
}
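
// The requested pipeline is therefore MFMACount * 3 repetitions of the
// pattern { 2 x DS, 1 x MFMA } in a single sync pipeline (SyncID 0),
// interleaving pairs of LDS operations between consecutive MFMAs.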

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
  // Whether the DS_READ is a predecessor of the first four MFMAs in the region
  class EnablesInitialMFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;
      int MFMAsFound = 0;
      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
            ++MFMAsFound;
            if (MFMAsFound > 4)
              break;
            Cache->push_back(&Elt);
          }
        }
      }

      assert(Cache->size());
      auto DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {
        if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
          return true;
      }
      return false;
    }

    EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
  class IsPermForDSW final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
        return false;

      bool FitsInGroup = false;
      // Does the VALU have a DS_WRITE successor?
      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();
          if (TII->isDS(*SuccUnit->getInstr()) &&
              SuccUnit->getInstr()->mayStore()) {
            Cache->push_back(SuccUnit);
            FitsInGroup = true;
          }
        }
        return FitsInGroup;
      }

      assert(Cache->size());

      // Does the VALU have a DS_WRITE successor that is the same as another
      // VALU already in the group? The V_PERMs will all share one DS_WRITE
      // successor.
      return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
        return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
          return ThisSucc.getSUnit() == Elt;
        });
      });
    }

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the SU is a successor of any element in the previous SchedGroup
  class IsSuccOfPrevGroup final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;
        }
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      // Does the previous VALU have this DS_WRITE as a successor?
      return (std::any_of(OtherGroup->Collection.begin(),
                          OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
                            return std::any_of(Elt->Succs.begin(),
                                               Elt->Succs.end(),
                                               [&SU](SDep &Succ) {
                                                 return Succ.getSUnit() == SU;
                                               });
                          }));
    }
    IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the combined load width of the group is 128 bits
  class VMEMSize final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
        return false;
      if (!Collection.size())
        return true;

      int NumBits = 0;

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
        auto Size =
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
        NumBits += Size;
      }

      if (NumBits < 128) {
        assert(TII->isVMEM(*MI) && MI->mayLoad());
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=
            128)
          return true;
      }

      return false;
    }

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
  /// that is \p Distance steps away
  class SharesPredWithPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      if (!Cache->size()) {

        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;
          }
        }

        if (!OtherGroup)
          return false;
        if (!OtherGroup->Collection.size())
          return true;

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());
          }
        }

        // If the other group has no PERM preds, then this group won't share
        // any.
        if (!Cache->size())
          return false;
      }

      auto DAG = SyncPipe[0].DAG;
      // Does the previous DS_WRITE share a V_PERM predecessor with this
      // VMEM_READ?
      return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
        return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
      });
    }
    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = 0;
  }
};

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    bool IsReentry) {
  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
                        DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");
  SmallVector<SUnit *, 6> DSWithPerms;
  for (auto &SU : DAG->SUnits) {
    auto I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))
      ++MFMACount;
    else if (TII->isDS(*I)) {
      if (I->mayLoad())
        ++DSRCount;
      else if (I->mayStore() && !IsReentry) {
        ++DSWCount;
        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {
            DSWithPerms.push_back(&SU);
            break;
          }
        }
      }
    }
  }

  if (!IsReentry) {
    DSWWithPermCount = DSWithPerms.size();
    auto I = DSWithPerms.begin();
    auto E = DSWithPerms.end();

    // Get the count of DS_WRITES with V_PERM predecessors which
    // have loop carried dependencies (WAR) on the same VMEM_READs.
    // We consider partial overlap as a miss -- in other words,
    // for a given DS_W, we only consider another DS_W as matching
    // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
    // for every V_PERM pred of this DS_W.
    DenseMap<MachineInstr *, SUnit *> VMEMLookup;
    SmallVector<SUnit *, 6> Counted;
    for (; I != E; I++) {
      SUnit *Cand = nullptr;
      bool MissedAny = false;
      for (auto &Pred : (*I)->Preds) {
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
          continue;

        if (Cand && llvm::is_contained(Counted, Cand))
          break;

        for (auto &Succ : Pred.getSUnit()->Succs) {
          auto MI = Succ.getSUnit()->getInstr();
          if (!TII->isVMEM(*MI) || !MI->mayLoad())
            continue;

          if (MissedAny || !VMEMLookup.size()) {
            MissedAny = true;
            VMEMLookup[MI] = *I;
            continue;
          }

          if (!VMEMLookup.contains(MI)) {
            MissedAny = true;
            VMEMLookup[MI] = *I;
            continue;
          }

          Cand = VMEMLookup[MI];
          if (llvm::is_contained(Counted, Cand)) {
            MissedAny = true;
            break;
          }
        }
      }
      if (!MissedAny && Cand) {
        DSWWithSharedVMEMCount += 2;
        Counted.push_back(Cand);
        Counted.push_back(*I);
      }
    }
  }

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
  SchedGroup *SG;
  unsigned PipelineSyncID = 0;
  // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }
  }

  PipelineSyncID = 1;
  // Phase 1: Break up DS_READ and MFMA clusters.
  // First DS_READ to make ready initial MFMA, then interleave MFMA with
  // DS_READ prefetch

  // Make ready initial MFMA
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  // Interleave MFMA with DS_READ prefetch
  for (unsigned I = 0; I < DSRCount - 4; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2a: Loop carried dependency with V_PERM
  // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
  // depend on. Interleave MFMA to keep XDL unit busy throughout.
  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2b: Loop carried dependency without V_PERM
  // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
  // Interleave MFMA to keep XDL unit busy throughout.
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
  // ultimately used by two DS_WRITEs.
  // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
  // depend on. Interleave MFMA to keep XDL unit busy throughout.

  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }
}

static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Used to track instructions that can be mapped to multiple sched groups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER. This is used for the base
  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
  // SCHED_BARRIER will always block all instructions that can be classified
  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
  // and may only synchronize with some SchedGroups. Returns the inverse of
  // Mask. SCHED_BARRIER's mask describes which instruction types should be
  // allowed to be scheduled across it. Invert the mask to get the
  // SchedGroupMask of instructions that should be barred.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  void initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  // The order in which the PipelineSolver should process the candidate
  // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
  // created SchedGroup first, and will consider that as the ultimate
  // predecessor group when linking. TOP_DOWN instead links and processes the
  // first created SchedGroup first.
  bool IsBottomUp = 1;

  // Whether or not this is a reentry into the IGroupLPDAGMutation.
  bool IsReentry = false;

  IGroupLPDAGMutation() = default;
  IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
};

unsigned SchedGroup::NumSchedGroups = 0;

bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
  if (A != B && DAG->canAddEdge(B, A)) {
    DAG->addEdge(B, SDep(A, SDep::Artificial));
    return true;
  }
  return false;
}
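
// tryAddEdge makes A an artificial predecessor of B, i.e. it orders A before
// B in the DAG. canAddEdge refuses edges that would create a cycle, which is
// how infeasible pipeline requests degrade into missed edges (counted by
// SchedGroup::link) instead of corrupting the DAG.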

bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
            TII->isTRANS(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
           TII->isTRANS(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}
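
// The checks form an if/else-if chain, so the first matching mask bit wins:
// a DS load, for instance, is accepted by a group whose mask sets DS or
// DS_READ (or ALL), but never by the VMEM cases, which exclude LDS accesses.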
1491
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;

    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible; otherwise it returns true.
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.push_back(std::pair(A, B));
    else
      ++MissedEdges;
  }

  return MissedEdges;
}

void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (P(A, B))
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}

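// A plain MI is classified directly; a BUNDLE is accepted only if every MI
// inside the bundle can be classified into this SchedGroup.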
bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
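  // Note: ++B advances past the BUNDLE header itself, so [B, E) covers only
  // the instructions inside the bundle.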
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
}

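// Greedily add DAG SUs to this SchedGroup, in node order, until the group is
// full.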
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {
    if (isFull())
      break;

    if (canAddSU(SU))
      add(SU);
  }
}

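// Scanning upwards from RIter, record every SU this group could classify as a
// candidate in SyncedInstrs. The SU at RIter itself (the SCHED_GROUP_BARRIER)
// is added unconditionally, and MaxSize is grown by one to make room for it.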
void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
    auto &SU = *RIter;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }

  add(InitSU);
  assert(MaxSize);
  (*MaxSize)++;
}

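// As above, but scan the whole region bottom-up and only collect candidates;
// no SU is added unconditionally.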
void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
    auto &SU = *I;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }
}

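// Scan the region bottom-up for scheduling intrinsics (SCHED_BARRIER,
// SCHED_GROUP_BARRIER, IGLP_OPT), build the SchedGroups they request, and
// hand the resulting constraint problem to the PipelineSolver. As an
// illustrative sketch (mask values taken from SchedGroupMask, not from any
// particular kernel), a user could ask for two DS reads followed by one MFMA
// per pipeline stage with:
//   __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS_READ x2, sync 0
//   __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA x1, sync 0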
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool foundSB = false;
  bool foundIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      foundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      foundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      if (!foundSB && !foundIGLP)
        initIGLPOpt(*R);
      foundIGLP = true;
    }
  }

  if (foundSB || foundIGLP) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
    // PipelineSolver performs the mutation by adding the edges it
    // determined to be the best.
    PS.solve();
  }
}

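// A SCHED_BARRIER's mask names the instruction classes allowed to cross it.
// Enforce that by building a SchedGroup from the inverted mask (the classes
// that must not cross) and linking each matching SU to the barrier on the
// side of it that the SU originally occupied.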
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetEdges(SchedBarrier, DAG);
  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                    << MI.getOperand(0).getImm() << "\n");
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();

  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}

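// Worked example: a SCHED_BARRIER mask of 0x10 (only VMEM may cross) inverts
// to "every class except VMEM is blocked"; since letting VMEM cross implies
// letting VMEM_READ and VMEM_WRITE cross, those bits are cleared from the
// inverted mask as well.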
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert mask and erase bits for types of instructions that are implied to
  // be allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA, TRANS.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
  // VALU, SALU, MFMA, TRANS implies ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE implies VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE implies DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
                    << "\n");

  return InvertedMask;
}

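// Decode the SCHED_GROUP_BARRIER operands (mask, size, sync ID) and create
// the corresponding SchedGroup within its synchronization pipeline.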
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
  // to the instruction having side effects.
  resetEdges(*RIter, DAG);
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}

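// IGLP_OPT carries a strategy ID as its only operand; instantiate that
// strategy and, if it applies to this DAG, let it populate the SchedGroups.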
void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (S->shouldApplyStrategy(DAG)) {
    IsBottomUp = S->IsBottomUp;
    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
  }
}

} // namespace

namespace llvm {

/// \p IsReentry specifies whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre- and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
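///
/// A typical client installs the mutation on its ScheduleDAGMI; a minimal
/// sketch (the exact call site varies by scheduler) is:
/// \code
///   DAG->addMutation(createIGroupLPDAGMutation(/*IsReentry=*/false));
/// \endcode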
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
  return std::make_unique<IGroupLPDAGMutation>(IsReentry);
}

} // end namespace llvm