//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
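//
// The IGroupLP mutation is enabled with the -amdgpu-igrouplp flag, while the
// SCHED_BARRIER mutation acts on SCHED_BARRIER instructions emitted for the
// corresponding builtin.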
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in the VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in the MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in the LDS/GDS read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in the LDS/GDS write group."));

using CanAddMIFn =
    function_ref<bool(const MachineInstr &, const SIInstrInfo *)>;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  void tryAddEdge(SUnit *A, SUnit *B) {
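    // Only add the artificial edge if DAG->canAddEdge() confirms it would not
    // create a cycle in the DAG.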
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor
  // (P = true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
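    // Note that "E = ++B" advances B past the BUNDLE header before E is
    // initialized, so the range [B, E) covers only the MIs inside the bundle.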
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
// DAG mutation that coordinates with the SCHED_BARRIER instruction and its
// corresponding builtin. The mutation adds edges between the SCHED_BARRIER
// and instructions in the classes determined by its mask so that those
// instructions cannot be scheduled around the SCHED_BARRIER.
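//
// A sketch of intended use (assuming the clang builtin
// __builtin_amdgcn_sched_barrier lowers to a SCHED_BARRIER with the given
// mask):
//
//   // ...loads for the next iteration...
//   __builtin_amdgcn_sched_barrier(0); // MASK = 0: nothing may be scheduled
//                                      // across this point.
//   // ...MFMAs for the current iteration...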
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determines which instructions may not be
  // scheduled across the SCHED_BARRIER.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };
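  // A set bit in the mask allows the corresponding instruction class to be
  // scheduled across the SCHED_BARRIER; SchedGroups (and thus ordering edges)
  // are built only for the classes whose bits are clear.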

  // Cache SchedGroups of each type so they can be reused if a region
  // contains multiple SCHED_BARRIERs.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that
  // should not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of SchedGroups in this vector defines the order in which edges
  // will be added. In other words, given the present ordering, we will try to
  // make each VMEM instruction a predecessor of each DSRead instruction, and
  // so on.
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

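  // Link each SchedGroup to every SchedGroup that follows it in
  // PipelineOrderGroups, so members of earlier groups become predecessors of
  // members of later groups wherever the DAG allows it.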
  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
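  // Keep each group member on its original side of the barrier: a member with
  // a higher NodeNum than the barrier gets the barrier as a predecessor, and
  // a member with a lower NodeNum becomes a predecessor of the barrier.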
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  // removePred() erases the edge from SU.Preds and the mirrored edge from the
  // other node's Succs list, so copy each edge and pop them one at a time
  // rather than iterating over a list that is being modified.
  while (!SU.Preds.empty()) {
    SDep Pred = SU.Preds[0];
    SU.removePred(Pred);
  }

  while (!SU.Succs.empty()) {
    SUnit *Succ = SU.Succs[0].getSUnit();
    for (auto &SP : Succ->Preds) {
      if (SP.getSUnit() == &SU) {
        SDep Pred = SP;
        Succ->removePred(Pred);
        break;
      }
    }
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}
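
// A minimal usage sketch (the real wiring lives in the target's scheduler
// setup, e.g. createMachineScheduler): ScheduleDAGMI::addMutation() ignores
// null mutations, so the IGroupLP mutation can be added unconditionally even
// though createIGroupLPDAGMutation() returns nullptr when -amdgpu-igrouplp is
// off.
//
//   DAG->addMutation(createIGroupLPDAGMutation());
//   DAG->addMutation(createSchedBarrierDAGMutation());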

} // end namespace llvm