1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // MachineScheduler schedules machine instructions after phi elimination. It
10 // preserves LiveIntervals so it can be invoked before register allocation.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/CodeGen/MachineScheduler.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/BitVector.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/PriorityQueue.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/CodeGen/LiveInterval.h"
25 #include "llvm/CodeGen/LiveIntervals.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineDominators.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineFunctionPass.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/CodeGen/MachineOperand.h"
33 #include "llvm/CodeGen/MachinePassRegistry.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterClassInfo.h"
36 #include "llvm/CodeGen/RegisterPressure.h"
37 #include "llvm/CodeGen/ScheduleDAG.h"
38 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
39 #include "llvm/CodeGen/ScheduleDAGMutation.h"
40 #include "llvm/CodeGen/ScheduleDFS.h"
41 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
42 #include "llvm/CodeGen/SlotIndexes.h"
43 #include "llvm/CodeGen/TargetFrameLowering.h"
44 #include "llvm/CodeGen/TargetInstrInfo.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/CodeGen/TargetRegisterInfo.h"
48 #include "llvm/CodeGen/TargetSchedule.h"
49 #include "llvm/CodeGen/TargetSubtargetInfo.h"
50 #include "llvm/Config/llvm-config.h"
51 #include "llvm/InitializePasses.h"
52 #include "llvm/MC/LaneBitmask.h"
53 #include "llvm/Pass.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Compiler.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/ErrorHandling.h"
58 #include "llvm/Support/GraphWriter.h"
59 #include "llvm/Support/MachineValueType.h"
60 #include "llvm/Support/raw_ostream.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <limits>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71
72 using namespace llvm;
73
74 #define DEBUG_TYPE "machine-scheduler"
75
76 STATISTIC(NumClustered, "Number of load/store pairs clustered");
77
78 namespace llvm {
79
80 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
81 cl::desc("Force top-down list scheduling"));
82 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
83 cl::desc("Force bottom-up list scheduling"));
84 cl::opt<bool>
85 DumpCriticalPathLength("misched-dcpl", cl::Hidden,
86 cl::desc("Print critical path length to stdout"));
87
88 cl::opt<bool> VerifyScheduling(
89 "verify-misched", cl::Hidden,
90 cl::desc("Verify machine instrs before and after machine scheduling"));
91
92 #ifndef NDEBUG
93 cl::opt<bool> ViewMISchedDAGs(
94 "view-misched-dags", cl::Hidden,
95 cl::desc("Pop up a window to show MISched dags after they are processed"));
96 cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
97 cl::desc("Print schedule DAGs"));
98 cl::opt<bool> MISchedDumpReservedCycles(
99 "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
100 cl::desc("Dump resource usage at schedule boundary."));
101 #else
102 const bool ViewMISchedDAGs = false;
103 const bool PrintDAGs = false;
104 #ifdef LLVM_ENABLE_DUMP
105 const bool MISchedDumpReservedCycles = false;
106 #endif // LLVM_ENABLE_DUMP
107 #endif // NDEBUG
108
109 } // end namespace llvm
110
111 #ifndef NDEBUG
112 /// In some situations a few uninteresting nodes depend on nearly all other
113 /// nodes in the graph, provide a cutoff to hide them.
114 static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
115 cl::desc("Hide nodes with more predecessor/successor than cutoff"));
116
117 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
118 cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
119
120 static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
121 cl::desc("Only schedule this function"));
122 static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
123 cl::desc("Only schedule this MBB#"));
124 #endif // NDEBUG
125
126 /// Avoid quadratic complexity in unusually large basic blocks by limiting the
127 /// size of the ready lists.
128 static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
129 cl::desc("Limit ready list to N instructions"), cl::init(256));
130
131 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
132 cl::desc("Enable register pressure scheduling."), cl::init(true));
133
134 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
135 cl::desc("Enable cyclic critical path analysis."), cl::init(true));
136
137 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
138 cl::desc("Enable memop clustering."),
139 cl::init(true));
140 static cl::opt<bool>
141 ForceFastCluster("force-fast-cluster", cl::Hidden,
142 cl::desc("Switch to fast cluster algorithm with the lost "
143 "of some fusion opportunities"),
144 cl::init(false));
145 static cl::opt<unsigned>
146 FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
147 cl::desc("The threshold for fast cluster"),
148 cl::init(1000));
149
150 // DAG subtrees must have at least this many nodes.
151 static const unsigned MinSubtreeSize = 8;
152
153 // Pin the vtables to this file.
anchor()154 void MachineSchedStrategy::anchor() {}
155
anchor()156 void ScheduleDAGMutation::anchor() {}
157
158 //===----------------------------------------------------------------------===//
159 // Machine Instruction Scheduling Pass and Registry
160 //===----------------------------------------------------------------------===//
161
MachineSchedContext()162 MachineSchedContext::MachineSchedContext() {
163 RegClassInfo = new RegisterClassInfo();
164 }
165
~MachineSchedContext()166 MachineSchedContext::~MachineSchedContext() {
167 delete RegClassInfo;
168 }
169
170 namespace {
171
172 /// Base class for a machine scheduler class that can run at any point.
173 class MachineSchedulerBase : public MachineSchedContext,
174 public MachineFunctionPass {
175 public:
MachineSchedulerBase(char & ID)176 MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
177
178 void print(raw_ostream &O, const Module* = nullptr) const override;
179
180 protected:
181 void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
182 };
183
184 /// MachineScheduler runs after coalescing and before register allocation.
185 class MachineScheduler : public MachineSchedulerBase {
186 public:
187 MachineScheduler();
188
189 void getAnalysisUsage(AnalysisUsage &AU) const override;
190
191 bool runOnMachineFunction(MachineFunction&) override;
192
193 static char ID; // Class identification, replacement for typeinfo
194
195 protected:
196 ScheduleDAGInstrs *createMachineScheduler();
197 };
198
199 /// PostMachineScheduler runs after shortly before code emission.
200 class PostMachineScheduler : public MachineSchedulerBase {
201 public:
202 PostMachineScheduler();
203
204 void getAnalysisUsage(AnalysisUsage &AU) const override;
205
206 bool runOnMachineFunction(MachineFunction&) override;
207
208 static char ID; // Class identification, replacement for typeinfo
209
210 protected:
211 ScheduleDAGInstrs *createPostMachineScheduler();
212 };
213
214 } // end anonymous namespace
215
216 char MachineScheduler::ID = 0;
217
218 char &llvm::MachineSchedulerID = MachineScheduler::ID;
219
220 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
221 "Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)222 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
223 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
224 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
225 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
226 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
227 INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
228 "Machine Instruction Scheduler", false, false)
229
230 MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
231 initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
232 }
233
getAnalysisUsage(AnalysisUsage & AU) const234 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
235 AU.setPreservesCFG();
236 AU.addRequired<MachineDominatorTree>();
237 AU.addRequired<MachineLoopInfo>();
238 AU.addRequired<AAResultsWrapperPass>();
239 AU.addRequired<TargetPassConfig>();
240 AU.addRequired<SlotIndexes>();
241 AU.addPreserved<SlotIndexes>();
242 AU.addRequired<LiveIntervals>();
243 AU.addPreserved<LiveIntervals>();
244 MachineFunctionPass::getAnalysisUsage(AU);
245 }
246
247 char PostMachineScheduler::ID = 0;
248
249 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
250
251 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
252 "PostRA Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)253 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
254 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
255 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
256 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
257 "PostRA Machine Instruction Scheduler", false, false)
258
259 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
260 initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
261 }
262
getAnalysisUsage(AnalysisUsage & AU) const263 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
264 AU.setPreservesCFG();
265 AU.addRequired<MachineDominatorTree>();
266 AU.addRequired<MachineLoopInfo>();
267 AU.addRequired<AAResultsWrapperPass>();
268 AU.addRequired<TargetPassConfig>();
269 MachineFunctionPass::getAnalysisUsage(AU);
270 }
271
272 MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
273 MachineSchedRegistry::Registry;
274
275 /// A dummy default scheduler factory indicates whether the scheduler
276 /// is overridden on the command line.
useDefaultMachineSched(MachineSchedContext * C)277 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
278 return nullptr;
279 }
280
281 /// MachineSchedOpt allows command line selection of the scheduler.
282 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
283 RegisterPassParser<MachineSchedRegistry>>
284 MachineSchedOpt("misched",
285 cl::init(&useDefaultMachineSched), cl::Hidden,
286 cl::desc("Machine instruction scheduler to use"));
287
288 static MachineSchedRegistry
289 DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
290 useDefaultMachineSched);
291
292 static cl::opt<bool> EnableMachineSched(
293 "enable-misched",
294 cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
295 cl::Hidden);
296
297 static cl::opt<bool> EnablePostRAMachineSched(
298 "enable-post-misched",
299 cl::desc("Enable the post-ra machine instruction scheduling pass."),
300 cl::init(true), cl::Hidden);
301
302 /// Decrement this iterator until reaching the top or a non-debug instr.
303 static MachineBasicBlock::const_iterator
priorNonDebug(MachineBasicBlock::const_iterator I,MachineBasicBlock::const_iterator Beg)304 priorNonDebug(MachineBasicBlock::const_iterator I,
305 MachineBasicBlock::const_iterator Beg) {
306 assert(I != Beg && "reached the top of the region, cannot decrement");
307 while (--I != Beg) {
308 if (!I->isDebugOrPseudoInstr())
309 break;
310 }
311 return I;
312 }
313
314 /// Non-const version.
315 static MachineBasicBlock::iterator
priorNonDebug(MachineBasicBlock::iterator I,MachineBasicBlock::const_iterator Beg)316 priorNonDebug(MachineBasicBlock::iterator I,
317 MachineBasicBlock::const_iterator Beg) {
318 return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)
319 .getNonConstIterator();
320 }
321
322 /// If this iterator is a debug value, increment until reaching the End or a
323 /// non-debug instruction.
324 static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,MachineBasicBlock::const_iterator End)325 nextIfDebug(MachineBasicBlock::const_iterator I,
326 MachineBasicBlock::const_iterator End) {
327 for(; I != End; ++I) {
328 if (!I->isDebugOrPseudoInstr())
329 break;
330 }
331 return I;
332 }
333
334 /// Non-const version.
335 static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,MachineBasicBlock::const_iterator End)336 nextIfDebug(MachineBasicBlock::iterator I,
337 MachineBasicBlock::const_iterator End) {
338 return nextIfDebug(MachineBasicBlock::const_iterator(I), End)
339 .getNonConstIterator();
340 }
341
342 /// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
createMachineScheduler()343 ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
344 // Select the scheduler, or set the default.
345 MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
346 if (Ctor != useDefaultMachineSched)
347 return Ctor(this);
348
349 // Get the default scheduler set by the target for this function.
350 ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
351 if (Scheduler)
352 return Scheduler;
353
354 // Default to GenericScheduler.
355 return createGenericSchedLive(this);
356 }
357
358 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
359 /// the caller. We don't have a command line option to override the postRA
360 /// scheduler. The Target must configure it.
createPostMachineScheduler()361 ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
362 // Get the postRA scheduler set by the target for this function.
363 ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
364 if (Scheduler)
365 return Scheduler;
366
367 // Default to GenericScheduler.
368 return createGenericSchedPostRA(this);
369 }
370
371 /// Top-level MachineScheduler pass driver.
372 ///
373 /// Visit blocks in function order. Divide each block into scheduling regions
374 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is
375 /// consistent with the DAG builder, which traverses the interior of the
376 /// scheduling regions bottom-up.
377 ///
378 /// This design avoids exposing scheduling boundaries to the DAG builder,
379 /// simplifying the DAG builder's support for "special" target instructions.
380 /// At the same time the design allows target schedulers to operate across
381 /// scheduling boundaries, for example to bundle the boundary instructions
382 /// without reordering them. This creates complexity, because the target
383 /// scheduler must update the RegionBegin and RegionEnd positions cached by
384 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
385 /// design would be to split blocks at scheduling boundaries, but LLVM has a
386 /// general bias against block splitting purely for implementation simplicity.
runOnMachineFunction(MachineFunction & mf)387 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
388 if (skipFunction(mf.getFunction()))
389 return false;
390
391 if (EnableMachineSched.getNumOccurrences()) {
392 if (!EnableMachineSched)
393 return false;
394 } else if (!mf.getSubtarget().enableMachineScheduler())
395 return false;
396
397 LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
398
399 // Initialize the context of the pass.
400 MF = &mf;
401 MLI = &getAnalysis<MachineLoopInfo>();
402 MDT = &getAnalysis<MachineDominatorTree>();
403 PassConfig = &getAnalysis<TargetPassConfig>();
404 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
405
406 LIS = &getAnalysis<LiveIntervals>();
407
408 if (VerifyScheduling) {
409 LLVM_DEBUG(LIS->dump());
410 MF->verify(this, "Before machine scheduling.");
411 }
412 RegClassInfo->runOnMachineFunction(*MF);
413
414 // Instantiate the selected scheduler for this target, function, and
415 // optimization level.
416 std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
417 scheduleRegions(*Scheduler, false);
418
419 LLVM_DEBUG(LIS->dump());
420 if (VerifyScheduling)
421 MF->verify(this, "After machine scheduling.");
422 return true;
423 }
424
runOnMachineFunction(MachineFunction & mf)425 bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
426 if (skipFunction(mf.getFunction()))
427 return false;
428
429 if (EnablePostRAMachineSched.getNumOccurrences()) {
430 if (!EnablePostRAMachineSched)
431 return false;
432 } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) {
433 LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
434 return false;
435 }
436 LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
437
438 // Initialize the context of the pass.
439 MF = &mf;
440 MLI = &getAnalysis<MachineLoopInfo>();
441 PassConfig = &getAnalysis<TargetPassConfig>();
442 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
443
444 if (VerifyScheduling)
445 MF->verify(this, "Before post machine scheduling.");
446
447 // Instantiate the selected scheduler for this target, function, and
448 // optimization level.
449 std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
450 scheduleRegions(*Scheduler, true);
451
452 if (VerifyScheduling)
453 MF->verify(this, "After post machine scheduling.");
454 return true;
455 }
456
457 /// Return true of the given instruction should not be included in a scheduling
458 /// region.
459 ///
460 /// MachineScheduler does not currently support scheduling across calls. To
461 /// handle calls, the DAG builder needs to be modified to create register
462 /// anti/output dependencies on the registers clobbered by the call's regmask
463 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents
464 /// scheduling across calls. In PostRA scheduling, we need the isCall to enforce
465 /// the boundary, but there would be no benefit to postRA scheduling across
466 /// calls this late anyway.
isSchedBoundary(MachineBasicBlock::iterator MI,MachineBasicBlock * MBB,MachineFunction * MF,const TargetInstrInfo * TII)467 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
468 MachineBasicBlock *MBB,
469 MachineFunction *MF,
470 const TargetInstrInfo *TII) {
471 return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
472 }
473
474 /// A region of an MBB for scheduling.
475 namespace {
476 struct SchedRegion {
477 /// RegionBegin is the first instruction in the scheduling region, and
478 /// RegionEnd is either MBB->end() or the scheduling boundary after the
479 /// last instruction in the scheduling region. These iterators cannot refer
480 /// to instructions outside of the identified scheduling region because
481 /// those may be reordered before scheduling this region.
482 MachineBasicBlock::iterator RegionBegin;
483 MachineBasicBlock::iterator RegionEnd;
484 unsigned NumRegionInstrs;
485
SchedRegion__anon6ea857550211::SchedRegion486 SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
487 unsigned N) :
488 RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
489 };
490 } // end anonymous namespace
491
492 using MBBRegionsVector = SmallVector<SchedRegion, 16>;
493
494 static void
getSchedRegions(MachineBasicBlock * MBB,MBBRegionsVector & Regions,bool RegionsTopDown)495 getSchedRegions(MachineBasicBlock *MBB,
496 MBBRegionsVector &Regions,
497 bool RegionsTopDown) {
498 MachineFunction *MF = MBB->getParent();
499 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
500
501 MachineBasicBlock::iterator I = nullptr;
502 for(MachineBasicBlock::iterator RegionEnd = MBB->end();
503 RegionEnd != MBB->begin(); RegionEnd = I) {
504
505 // Avoid decrementing RegionEnd for blocks with no terminator.
506 if (RegionEnd != MBB->end() ||
507 isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
508 --RegionEnd;
509 }
510
511 // The next region starts above the previous region. Look backward in the
512 // instruction stream until we find the nearest boundary.
513 unsigned NumRegionInstrs = 0;
514 I = RegionEnd;
515 for (;I != MBB->begin(); --I) {
516 MachineInstr &MI = *std::prev(I);
517 if (isSchedBoundary(&MI, &*MBB, MF, TII))
518 break;
519 if (!MI.isDebugOrPseudoInstr()) {
520 // MBB::size() uses instr_iterator to count. Here we need a bundle to
521 // count as a single instruction.
522 ++NumRegionInstrs;
523 }
524 }
525
526 // It's possible we found a scheduling region that only has debug
527 // instructions. Don't bother scheduling these.
528 if (NumRegionInstrs != 0)
529 Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
530 }
531
532 if (RegionsTopDown)
533 std::reverse(Regions.begin(), Regions.end());
534 }
535
536 /// Main driver for both MachineScheduler and PostMachineScheduler.
scheduleRegions(ScheduleDAGInstrs & Scheduler,bool FixKillFlags)537 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
538 bool FixKillFlags) {
539 // Visit all machine basic blocks.
540 //
541 // TODO: Visit blocks in global postorder or postorder within the bottom-up
542 // loop tree. Then we can optionally compute global RegPressure.
543 for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
544 MBB != MBBEnd; ++MBB) {
545
546 Scheduler.startBlock(&*MBB);
547
548 #ifndef NDEBUG
549 if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
550 continue;
551 if (SchedOnlyBlock.getNumOccurrences()
552 && (int)SchedOnlyBlock != MBB->getNumber())
553 continue;
554 #endif
555
556 // Break the block into scheduling regions [I, RegionEnd). RegionEnd
557 // points to the scheduling boundary at the bottom of the region. The DAG
558 // does not include RegionEnd, but the region does (i.e. the next
559 // RegionEnd is above the previous RegionBegin). If the current block has
560 // no terminator then RegionEnd == MBB->end() for the bottom region.
561 //
562 // All the regions of MBB are first found and stored in MBBRegions, which
563 // will be processed (MBB) top-down if initialized with true.
564 //
565 // The Scheduler may insert instructions during either schedule() or
566 // exitRegion(), even for empty regions. So the local iterators 'I' and
567 // 'RegionEnd' are invalid across these calls. Instructions must not be
568 // added to other regions than the current one without updating MBBRegions.
569
570 MBBRegionsVector MBBRegions;
571 getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown());
572 for (const SchedRegion &R : MBBRegions) {
573 MachineBasicBlock::iterator I = R.RegionBegin;
574 MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
575 unsigned NumRegionInstrs = R.NumRegionInstrs;
576
577 // Notify the scheduler of the region, even if we may skip scheduling
578 // it. Perhaps it still needs to be bundled.
579 Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
580
581 // Skip empty scheduling regions (0 or 1 schedulable instructions).
582 if (I == RegionEnd || I == std::prev(RegionEnd)) {
583 // Close the current region. Bundle the terminator if needed.
584 // This invalidates 'RegionEnd' and 'I'.
585 Scheduler.exitRegion();
586 continue;
587 }
588 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
589 LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
590 << " " << MBB->getName() << "\n From: " << *I
591 << " To: ";
592 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
593 else dbgs() << "End\n";
594 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
595 if (DumpCriticalPathLength) {
596 errs() << MF->getName();
597 errs() << ":%bb. " << MBB->getNumber();
598 errs() << " " << MBB->getName() << " \n";
599 }
600
601 // Schedule a region: possibly reorder instructions.
602 // This invalidates the original region iterators.
603 Scheduler.schedule();
604
605 // Close the current region.
606 Scheduler.exitRegion();
607 }
608 Scheduler.finishBlock();
609 // FIXME: Ideally, no further passes should rely on kill flags. However,
610 // thumb2 size reduction is currently an exception, so the PostMIScheduler
611 // needs to do this.
612 if (FixKillFlags)
613 Scheduler.fixupKills(*MBB);
614 }
615 Scheduler.finalizeSchedule();
616 }
617
print(raw_ostream & O,const Module * m) const618 void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
619 // unimplemented
620 }
621
622 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump() const623 LLVM_DUMP_METHOD void ReadyQueue::dump() const {
624 dbgs() << "Queue " << Name << ": ";
625 for (const SUnit *SU : Queue)
626 dbgs() << SU->NodeNum << " ";
627 dbgs() << "\n";
628 }
629 #endif
630
631 //===----------------------------------------------------------------------===//
632 // ScheduleDAGMI - Basic machine instruction scheduling. This is
633 // independent of PreRA/PostRA scheduling and involves no extra book-keeping for
634 // virtual registers.
635 // ===----------------------------------------------------------------------===/
636
637 // Provide a vtable anchor.
638 ScheduleDAGMI::~ScheduleDAGMI() = default;
639
640 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
641 /// NumPredsLeft reaches zero, release the successor node.
642 ///
643 /// FIXME: Adjust SuccSU height based on MinLatency.
releaseSucc(SUnit * SU,SDep * SuccEdge)644 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
645 SUnit *SuccSU = SuccEdge->getSUnit();
646
647 if (SuccEdge->isWeak()) {
648 --SuccSU->WeakPredsLeft;
649 if (SuccEdge->isCluster())
650 NextClusterSucc = SuccSU;
651 return;
652 }
653 #ifndef NDEBUG
654 if (SuccSU->NumPredsLeft == 0) {
655 dbgs() << "*** Scheduling failed! ***\n";
656 dumpNode(*SuccSU);
657 dbgs() << " has been released too many times!\n";
658 llvm_unreachable(nullptr);
659 }
660 #endif
661 // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
662 // CurrCycle may have advanced since then.
663 if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
664 SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
665
666 --SuccSU->NumPredsLeft;
667 if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
668 SchedImpl->releaseTopNode(SuccSU);
669 }
670
671 /// releaseSuccessors - Call releaseSucc on each of SU's successors.
releaseSuccessors(SUnit * SU)672 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
673 for (SDep &Succ : SU->Succs)
674 releaseSucc(SU, &Succ);
675 }
676
677 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
678 /// NumSuccsLeft reaches zero, release the predecessor node.
679 ///
680 /// FIXME: Adjust PredSU height based on MinLatency.
releasePred(SUnit * SU,SDep * PredEdge)681 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
682 SUnit *PredSU = PredEdge->getSUnit();
683
684 if (PredEdge->isWeak()) {
685 --PredSU->WeakSuccsLeft;
686 if (PredEdge->isCluster())
687 NextClusterPred = PredSU;
688 return;
689 }
690 #ifndef NDEBUG
691 if (PredSU->NumSuccsLeft == 0) {
692 dbgs() << "*** Scheduling failed! ***\n";
693 dumpNode(*PredSU);
694 dbgs() << " has been released too many times!\n";
695 llvm_unreachable(nullptr);
696 }
697 #endif
698 // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
699 // CurrCycle may have advanced since then.
700 if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
701 PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
702
703 --PredSU->NumSuccsLeft;
704 if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
705 SchedImpl->releaseBottomNode(PredSU);
706 }
707
708 /// releasePredecessors - Call releasePred on each of SU's predecessors.
releasePredecessors(SUnit * SU)709 void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
710 for (SDep &Pred : SU->Preds)
711 releasePred(SU, &Pred);
712 }
713
startBlock(MachineBasicBlock * bb)714 void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) {
715 ScheduleDAGInstrs::startBlock(bb);
716 SchedImpl->enterMBB(bb);
717 }
718
finishBlock()719 void ScheduleDAGMI::finishBlock() {
720 SchedImpl->leaveMBB();
721 ScheduleDAGInstrs::finishBlock();
722 }
723
724 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
725 /// crossing a scheduling boundary. [begin, end) includes all instructions in
726 /// the region, including the boundary itself and single-instruction regions
727 /// that don't get scheduled.
enterRegion(MachineBasicBlock * bb,MachineBasicBlock::iterator begin,MachineBasicBlock::iterator end,unsigned regioninstrs)728 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
729 MachineBasicBlock::iterator begin,
730 MachineBasicBlock::iterator end,
731 unsigned regioninstrs)
732 {
733 ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
734
735 SchedImpl->initPolicy(begin, end, regioninstrs);
736 }
737
738 /// This is normally called from the main scheduler loop but may also be invoked
739 /// by the scheduling strategy to perform additional code motion.
moveInstruction(MachineInstr * MI,MachineBasicBlock::iterator InsertPos)740 void ScheduleDAGMI::moveInstruction(
741 MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
742 // Advance RegionBegin if the first instruction moves down.
743 if (&*RegionBegin == MI)
744 ++RegionBegin;
745
746 // Update the instruction stream.
747 BB->splice(InsertPos, BB, MI);
748
749 // Update LiveIntervals
750 if (LIS)
751 LIS->handleMove(*MI, /*UpdateFlags=*/true);
752
753 // Recede RegionBegin if an instruction moves above the first.
754 if (RegionBegin == InsertPos)
755 RegionBegin = MI;
756 }
757
checkSchedLimit()758 bool ScheduleDAGMI::checkSchedLimit() {
759 #if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG)
760 if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
761 CurrentTop = CurrentBottom;
762 return false;
763 }
764 ++NumInstrsScheduled;
765 #endif
766 return true;
767 }
768
769 /// Per-region scheduling driver, called back from
770 /// MachineScheduler::runOnMachineFunction. This is a simplified driver that
771 /// does not consider liveness or register pressure. It is useful for PostRA
772 /// scheduling and potentially other custom schedulers.
schedule()773 void ScheduleDAGMI::schedule() {
774 LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
775 LLVM_DEBUG(SchedImpl->dumpPolicy());
776
777 // Build the DAG.
778 buildSchedGraph(AA);
779
780 postprocessDAG();
781
782 SmallVector<SUnit*, 8> TopRoots, BotRoots;
783 findRootsAndBiasEdges(TopRoots, BotRoots);
784
785 LLVM_DEBUG(dump());
786 if (PrintDAGs) dump();
787 if (ViewMISchedDAGs) viewGraph();
788
789 // Initialize the strategy before modifying the DAG.
790 // This may initialize a DFSResult to be used for queue priority.
791 SchedImpl->initialize(this);
792
793 // Initialize ready queues now that the DAG and priority data are finalized.
794 initQueues(TopRoots, BotRoots);
795
796 bool IsTopNode = false;
797 while (true) {
798 LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
799 SUnit *SU = SchedImpl->pickNode(IsTopNode);
800 if (!SU) break;
801
802 assert(!SU->isScheduled && "Node already scheduled");
803 if (!checkSchedLimit())
804 break;
805
806 MachineInstr *MI = SU->getInstr();
807 if (IsTopNode) {
808 assert(SU->isTopReady() && "node still has unscheduled dependencies");
809 if (&*CurrentTop == MI)
810 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
811 else
812 moveInstruction(MI, CurrentTop);
813 } else {
814 assert(SU->isBottomReady() && "node still has unscheduled dependencies");
815 MachineBasicBlock::iterator priorII =
816 priorNonDebug(CurrentBottom, CurrentTop);
817 if (&*priorII == MI)
818 CurrentBottom = priorII;
819 else {
820 if (&*CurrentTop == MI)
821 CurrentTop = nextIfDebug(++CurrentTop, priorII);
822 moveInstruction(MI, CurrentBottom);
823 CurrentBottom = MI;
824 }
825 }
826 // Notify the scheduling strategy before updating the DAG.
827 // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
828 // runs, it can then use the accurate ReadyCycle time to determine whether
829 // newly released nodes can move to the readyQ.
830 SchedImpl->schedNode(SU, IsTopNode);
831
832 updateQueues(SU, IsTopNode);
833 }
834 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
835
836 placeDebugValues();
837
838 LLVM_DEBUG({
839 dbgs() << "*** Final schedule for "
840 << printMBBReference(*begin()->getParent()) << " ***\n";
841 dumpSchedule();
842 dbgs() << '\n';
843 });
844 }
845
846 /// Apply each ScheduleDAGMutation step in order.
postprocessDAG()847 void ScheduleDAGMI::postprocessDAG() {
848 for (auto &m : Mutations)
849 m->apply(this);
850 }
851
852 void ScheduleDAGMI::
findRootsAndBiasEdges(SmallVectorImpl<SUnit * > & TopRoots,SmallVectorImpl<SUnit * > & BotRoots)853 findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
854 SmallVectorImpl<SUnit*> &BotRoots) {
855 for (SUnit &SU : SUnits) {
856 assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
857
858 // Order predecessors so DFSResult follows the critical path.
859 SU.biasCriticalPath();
860
861 // A SUnit is ready to top schedule if it has no predecessors.
862 if (!SU.NumPredsLeft)
863 TopRoots.push_back(&SU);
864 // A SUnit is ready to bottom schedule if it has no successors.
865 if (!SU.NumSuccsLeft)
866 BotRoots.push_back(&SU);
867 }
868 ExitSU.biasCriticalPath();
869 }
870
871 /// Identify DAG roots and setup scheduler queues.
initQueues(ArrayRef<SUnit * > TopRoots,ArrayRef<SUnit * > BotRoots)872 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
873 ArrayRef<SUnit*> BotRoots) {
874 NextClusterSucc = nullptr;
875 NextClusterPred = nullptr;
876
877 // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
878 //
879 // Nodes with unreleased weak edges can still be roots.
880 // Release top roots in forward order.
881 for (SUnit *SU : TopRoots)
882 SchedImpl->releaseTopNode(SU);
883
884 // Release bottom roots in reverse order so the higher priority nodes appear
885 // first. This is more natural and slightly more efficient.
886 for (SmallVectorImpl<SUnit*>::const_reverse_iterator
887 I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
888 SchedImpl->releaseBottomNode(*I);
889 }
890
891 releaseSuccessors(&EntrySU);
892 releasePredecessors(&ExitSU);
893
894 SchedImpl->registerRoots();
895
896 // Advance past initial DebugValues.
897 CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
898 CurrentBottom = RegionEnd;
899 }
900
901 /// Update scheduler queues after scheduling an instruction.
updateQueues(SUnit * SU,bool IsTopNode)902 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
903 // Release dependent instructions for scheduling.
904 if (IsTopNode)
905 releaseSuccessors(SU);
906 else
907 releasePredecessors(SU);
908
909 SU->isScheduled = true;
910 }
911
912 /// Reinsert any remaining debug_values, just like the PostRA scheduler.
placeDebugValues()913 void ScheduleDAGMI::placeDebugValues() {
914 // If first instruction was a DBG_VALUE then put it back.
915 if (FirstDbgValue) {
916 BB->splice(RegionBegin, BB, FirstDbgValue);
917 RegionBegin = FirstDbgValue;
918 }
919
920 for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
921 DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
922 std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
923 MachineInstr *DbgValue = P.first;
924 MachineBasicBlock::iterator OrigPrevMI = P.second;
925 if (&*RegionBegin == DbgValue)
926 ++RegionBegin;
927 BB->splice(std::next(OrigPrevMI), BB, DbgValue);
928 if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd)
929 RegionEnd = DbgValue;
930 }
931 }
932
933 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dumpSchedule() const934 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
935 for (MachineInstr &MI : *this) {
936 if (SUnit *SU = getSUnit(&MI))
937 dumpNode(*SU);
938 else
939 dbgs() << "Missing SUnit\n";
940 }
941 }
942 #endif
943
944 //===----------------------------------------------------------------------===//
945 // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
946 // preservation.
947 //===----------------------------------------------------------------------===//
948
~ScheduleDAGMILive()949 ScheduleDAGMILive::~ScheduleDAGMILive() {
950 delete DFSResult;
951 }
952
collectVRegUses(SUnit & SU)953 void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
954 const MachineInstr &MI = *SU.getInstr();
955 for (const MachineOperand &MO : MI.operands()) {
956 if (!MO.isReg())
957 continue;
958 if (!MO.readsReg())
959 continue;
960 if (TrackLaneMasks && !MO.isUse())
961 continue;
962
963 Register Reg = MO.getReg();
964 if (!Reg.isVirtual())
965 continue;
966
967 // Ignore re-defs.
968 if (TrackLaneMasks) {
969 bool FoundDef = false;
970 for (const MachineOperand &MO2 : MI.operands()) {
971 if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
972 FoundDef = true;
973 break;
974 }
975 }
976 if (FoundDef)
977 continue;
978 }
979
980 // Record this local VReg use.
981 VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
982 for (; UI != VRegUses.end(); ++UI) {
983 if (UI->SU == &SU)
984 break;
985 }
986 if (UI == VRegUses.end())
987 VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU));
988 }
989 }
990
991 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
992 /// crossing a scheduling boundary. [begin, end) includes all instructions in
993 /// the region, including the boundary itself and single-instruction regions
994 /// that don't get scheduled.
enterRegion(MachineBasicBlock * bb,MachineBasicBlock::iterator begin,MachineBasicBlock::iterator end,unsigned regioninstrs)995 void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
996 MachineBasicBlock::iterator begin,
997 MachineBasicBlock::iterator end,
998 unsigned regioninstrs)
999 {
1000 // ScheduleDAGMI initializes SchedImpl's per-region policy.
1001 ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
1002
1003 // For convenience remember the end of the liveness region.
1004 LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
1005
1006 SUPressureDiffs.clear();
1007
1008 ShouldTrackPressure = SchedImpl->shouldTrackPressure();
1009 ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
1010
1011 assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
1012 "ShouldTrackLaneMasks requires ShouldTrackPressure");
1013 }
1014
1015 // Setup the register pressure trackers for the top scheduled and bottom
1016 // scheduled regions.
initRegPressure()1017 void ScheduleDAGMILive::initRegPressure() {
1018 VRegUses.clear();
1019 VRegUses.setUniverse(MRI.getNumVirtRegs());
1020 for (SUnit &SU : SUnits)
1021 collectVRegUses(SU);
1022
1023 TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
1024 ShouldTrackLaneMasks, false);
1025 BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1026 ShouldTrackLaneMasks, false);
1027
1028 // Close the RPTracker to finalize live ins.
1029 RPTracker.closeRegion();
1030
1031 LLVM_DEBUG(RPTracker.dump());
1032
1033 // Initialize the live ins and live outs.
1034 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
1035 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
1036
1037 // Close one end of the tracker so we can call
1038 // getMaxUpward/DownwardPressureDelta before advancing across any
1039 // instructions. This converts currently live regs into live ins/outs.
1040 TopRPTracker.closeTop();
1041 BotRPTracker.closeBottom();
1042
1043 BotRPTracker.initLiveThru(RPTracker);
1044 if (!BotRPTracker.getLiveThru().empty()) {
1045 TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
1046 LLVM_DEBUG(dbgs() << "Live Thru: ";
1047 dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
1048 };
1049
1050 // For each live out vreg reduce the pressure change associated with other
1051 // uses of the same vreg below the live-out reaching def.
1052 updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
1053
1054 // Account for liveness generated by the region boundary.
1055 if (LiveRegionEnd != RegionEnd) {
1056 SmallVector<RegisterMaskPair, 8> LiveUses;
1057 BotRPTracker.recede(&LiveUses);
1058 updatePressureDiffs(LiveUses);
1059 }
1060
1061 LLVM_DEBUG(dbgs() << "Top Pressure:\n";
1062 dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
1063 dbgs() << "Bottom Pressure:\n";
1064 dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
1065
1066 assert((BotRPTracker.getPos() == RegionEnd ||
1067 (RegionEnd->isDebugInstr() &&
1068 BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
1069 "Can't find the region bottom");
1070
1071 // Cache the list of excess pressure sets in this region. This will also track
1072 // the max pressure in the scheduled code for these sets.
1073 RegionCriticalPSets.clear();
1074 const std::vector<unsigned> &RegionPressure =
1075 RPTracker.getPressure().MaxSetPressure;
1076 for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
1077 unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
1078 if (RegionPressure[i] > Limit) {
1079 LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
1080 << " Actual " << RegionPressure[i] << "\n");
1081 RegionCriticalPSets.push_back(PressureChange(i));
1082 }
1083 }
1084 LLVM_DEBUG(dbgs() << "Excess PSets: ";
1085 for (const PressureChange &RCPS
1086 : RegionCriticalPSets) dbgs()
1087 << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
1088 dbgs() << "\n");
1089 }
1090
1091 void ScheduleDAGMILive::
updateScheduledPressure(const SUnit * SU,const std::vector<unsigned> & NewMaxPressure)1092 updateScheduledPressure(const SUnit *SU,
1093 const std::vector<unsigned> &NewMaxPressure) {
1094 const PressureDiff &PDiff = getPressureDiff(SU);
1095 unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
1096 for (const PressureChange &PC : PDiff) {
1097 if (!PC.isValid())
1098 break;
1099 unsigned ID = PC.getPSet();
1100 while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
1101 ++CritIdx;
1102 if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
1103 if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
1104 && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
1105 RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
1106 }
1107 unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
1108 if (NewMaxPressure[ID] >= Limit - 2) {
1109 LLVM_DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
1110 << NewMaxPressure[ID]
1111 << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
1112 << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
1113 << " livethru)\n");
1114 }
1115 }
1116 }
1117
1118 /// Update the PressureDiff array for liveness after scheduling this
1119 /// instruction.
updatePressureDiffs(ArrayRef<RegisterMaskPair> LiveUses)1120 void ScheduleDAGMILive::updatePressureDiffs(
1121 ArrayRef<RegisterMaskPair> LiveUses) {
1122 for (const RegisterMaskPair &P : LiveUses) {
1123 Register Reg = P.RegUnit;
1124 /// FIXME: Currently assuming single-use physregs.
1125 if (!Reg.isVirtual())
1126 continue;
1127
1128 if (ShouldTrackLaneMasks) {
1129 // If the register has just become live then other uses won't change
1130 // this fact anymore => decrement pressure.
1131 // If the register has just become dead then other uses make it come
1132 // back to life => increment pressure.
1133 bool Decrement = P.LaneMask.any();
1134
1135 for (const VReg2SUnit &V2SU
1136 : make_range(VRegUses.find(Reg), VRegUses.end())) {
1137 SUnit &SU = *V2SU.SU;
1138 if (SU.isScheduled || &SU == &ExitSU)
1139 continue;
1140
1141 PressureDiff &PDiff = getPressureDiff(&SU);
1142 PDiff.addPressureChange(Reg, Decrement, &MRI);
1143 LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
1144 << printReg(Reg, TRI) << ':'
1145 << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
1146 dbgs() << " to "; PDiff.dump(*TRI););
1147 }
1148 } else {
1149 assert(P.LaneMask.any());
1150 LLVM_DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
1151 // This may be called before CurrentBottom has been initialized. However,
1152 // BotRPTracker must have a valid position. We want the value live into the
1153 // instruction or live out of the block, so ask for the previous
1154 // instruction's live-out.
1155 const LiveInterval &LI = LIS->getInterval(Reg);
1156 VNInfo *VNI;
1157 MachineBasicBlock::const_iterator I =
1158 nextIfDebug(BotRPTracker.getPos(), BB->end());
1159 if (I == BB->end())
1160 VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1161 else {
1162 LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
1163 VNI = LRQ.valueIn();
1164 }
1165 // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
1166 assert(VNI && "No live value at use.");
1167 for (const VReg2SUnit &V2SU
1168 : make_range(VRegUses.find(Reg), VRegUses.end())) {
1169 SUnit *SU = V2SU.SU;
1170 // If this use comes before the reaching def, it cannot be a last use,
1171 // so decrease its pressure change.
1172 if (!SU->isScheduled && SU != &ExitSU) {
1173 LiveQueryResult LRQ =
1174 LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1175 if (LRQ.valueIn() == VNI) {
1176 PressureDiff &PDiff = getPressureDiff(SU);
1177 PDiff.addPressureChange(Reg, true, &MRI);
1178 LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
1179 << *SU->getInstr();
1180 dbgs() << " to "; PDiff.dump(*TRI););
1181 }
1182 }
1183 }
1184 }
1185 }
1186 }
1187
dump() const1188 void ScheduleDAGMILive::dump() const {
1189 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1190 if (EntrySU.getInstr() != nullptr)
1191 dumpNodeAll(EntrySU);
1192 for (const SUnit &SU : SUnits) {
1193 dumpNodeAll(SU);
1194 if (ShouldTrackPressure) {
1195 dbgs() << " Pressure Diff : ";
1196 getPressureDiff(&SU).dump(*TRI);
1197 }
1198 dbgs() << " Single Issue : ";
1199 if (SchedModel.mustBeginGroup(SU.getInstr()) &&
1200 SchedModel.mustEndGroup(SU.getInstr()))
1201 dbgs() << "true;";
1202 else
1203 dbgs() << "false;";
1204 dbgs() << '\n';
1205 }
1206 if (ExitSU.getInstr() != nullptr)
1207 dumpNodeAll(ExitSU);
1208 #endif
1209 }
1210
1211 /// schedule - Called back from MachineScheduler::runOnMachineFunction
1212 /// after setting up the current scheduling region. [RegionBegin, RegionEnd)
1213 /// only includes instructions that have DAG nodes, not scheduling boundaries.
1214 ///
1215 /// This is a skeletal driver, with all the functionality pushed into helpers,
1216 /// so that it can be easily extended by experimental schedulers. Generally,
1217 /// implementing MachineSchedStrategy should be sufficient to implement a new
1218 /// scheduling algorithm. However, if a scheduler further subclasses
1219 /// ScheduleDAGMILive then it will want to override this virtual method in order
1220 /// to update any specialized state.
schedule()1221 void ScheduleDAGMILive::schedule() {
1222 LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
1223 LLVM_DEBUG(SchedImpl->dumpPolicy());
1224 buildDAGWithRegPressure();
1225
1226 postprocessDAG();
1227
1228 SmallVector<SUnit*, 8> TopRoots, BotRoots;
1229 findRootsAndBiasEdges(TopRoots, BotRoots);
1230
1231 // Initialize the strategy before modifying the DAG.
1232 // This may initialize a DFSResult to be used for queue priority.
1233 SchedImpl->initialize(this);
1234
1235 LLVM_DEBUG(dump());
1236 if (PrintDAGs) dump();
1237 if (ViewMISchedDAGs) viewGraph();
1238
1239 // Initialize ready queues now that the DAG and priority data are finalized.
1240 initQueues(TopRoots, BotRoots);
1241
1242 bool IsTopNode = false;
1243 while (true) {
1244 LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
1245 SUnit *SU = SchedImpl->pickNode(IsTopNode);
1246 if (!SU) break;
1247
1248 assert(!SU->isScheduled && "Node already scheduled");
1249 if (!checkSchedLimit())
1250 break;
1251
1252 scheduleMI(SU, IsTopNode);
1253
1254 if (DFSResult) {
1255 unsigned SubtreeID = DFSResult->getSubtreeID(SU);
1256 if (!ScheduledTrees.test(SubtreeID)) {
1257 ScheduledTrees.set(SubtreeID);
1258 DFSResult->scheduleTree(SubtreeID);
1259 SchedImpl->scheduleTree(SubtreeID);
1260 }
1261 }
1262
1263 // Notify the scheduling strategy after updating the DAG.
1264 SchedImpl->schedNode(SU, IsTopNode);
1265
1266 updateQueues(SU, IsTopNode);
1267 }
1268 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1269
1270 placeDebugValues();
1271
1272 LLVM_DEBUG({
1273 dbgs() << "*** Final schedule for "
1274 << printMBBReference(*begin()->getParent()) << " ***\n";
1275 dumpSchedule();
1276 dbgs() << '\n';
1277 });
1278 }
1279
1280 /// Build the DAG and setup three register pressure trackers.
buildDAGWithRegPressure()1281 void ScheduleDAGMILive::buildDAGWithRegPressure() {
1282 if (!ShouldTrackPressure) {
1283 RPTracker.reset();
1284 RegionCriticalPSets.clear();
1285 buildSchedGraph(AA);
1286 return;
1287 }
1288
1289 // Initialize the register pressure tracker used by buildSchedGraph.
1290 RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1291 ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
1292
1293 // Account for liveness generate by the region boundary.
1294 if (LiveRegionEnd != RegionEnd)
1295 RPTracker.recede();
1296
1297 // Build the DAG, and compute current register pressure.
1298 buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
1299
1300 // Initialize top/bottom trackers after computing region pressure.
1301 initRegPressure();
1302 }
1303
computeDFSResult()1304 void ScheduleDAGMILive::computeDFSResult() {
1305 if (!DFSResult)
1306 DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
1307 DFSResult->clear();
1308 ScheduledTrees.clear();
1309 DFSResult->resize(SUnits.size());
1310 DFSResult->compute(SUnits);
1311 ScheduledTrees.resize(DFSResult->getNumSubtrees());
1312 }
1313
1314 /// Compute the max cyclic critical path through the DAG. The scheduling DAG
1315 /// only provides the critical path for single block loops. To handle loops that
1316 /// span blocks, we could use the vreg path latencies provided by
1317 /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
1318 /// available for use in the scheduler.
1319 ///
1320 /// The cyclic path estimation identifies a def-use pair that crosses the back
1321 /// edge and considers the depth and height of the nodes. For example, consider
1322 /// the following instruction sequence where each instruction has unit latency
1323 /// and defines an eponymous virtual register:
1324 ///
1325 /// a->b(a,c)->c(b)->d(c)->exit
1326 ///
1327 /// The cyclic critical path is a two cycles: b->c->b
1328 /// The acyclic critical path is four cycles: a->b->c->d->exit
1329 /// LiveOutHeight = height(c) = len(c->d->exit) = 2
1330 /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
1331 /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
1332 /// LiveInDepth = depth(b) = len(a->b) = 1
1333 ///
1334 /// LiveOutDepth - LiveInDepth = 3 - 1 = 2
1335 /// LiveInHeight - LiveOutHeight = 4 - 2 = 2
1336 /// CyclicCriticalPath = min(2, 2) = 2
1337 ///
1338 /// This could be relevant to PostRA scheduling, but is currently implemented
1339 /// assuming LiveIntervals.
computeCyclicCriticalPath()1340 unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
1341 // This only applies to single block loop.
1342 if (!BB->isSuccessor(BB))
1343 return 0;
1344
1345 unsigned MaxCyclicLatency = 0;
1346 // Visit each live out vreg def to find def/use pairs that cross iterations.
1347 for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
1348 Register Reg = P.RegUnit;
1349 if (!Reg.isVirtual())
1350 continue;
1351 const LiveInterval &LI = LIS->getInterval(Reg);
1352 const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1353 if (!DefVNI)
1354 continue;
1355
1356 MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
1357 const SUnit *DefSU = getSUnit(DefMI);
1358 if (!DefSU)
1359 continue;
1360
1361 unsigned LiveOutHeight = DefSU->getHeight();
1362 unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
1363 // Visit all local users of the vreg def.
1364 for (const VReg2SUnit &V2SU
1365 : make_range(VRegUses.find(Reg), VRegUses.end())) {
1366 SUnit *SU = V2SU.SU;
1367 if (SU == &ExitSU)
1368 continue;
1369
1370 // Only consider uses of the phi.
1371 LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1372 if (!LRQ.valueIn()->isPHIDef())
1373 continue;
1374
1375 // Assume that a path spanning two iterations is a cycle, which could
1376 // overestimate in strange cases. This allows cyclic latency to be
1377 // estimated as the minimum slack of the vreg's depth or height.
1378 unsigned CyclicLatency = 0;
1379 if (LiveOutDepth > SU->getDepth())
1380 CyclicLatency = LiveOutDepth - SU->getDepth();
1381
1382 unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
1383 if (LiveInHeight > LiveOutHeight) {
1384 if (LiveInHeight - LiveOutHeight < CyclicLatency)
1385 CyclicLatency = LiveInHeight - LiveOutHeight;
1386 } else
1387 CyclicLatency = 0;
1388
1389 LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
1390 << SU->NodeNum << ") = " << CyclicLatency << "c\n");
1391 if (CyclicLatency > MaxCyclicLatency)
1392 MaxCyclicLatency = CyclicLatency;
1393 }
1394 }
1395 LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
1396 return MaxCyclicLatency;
1397 }
1398
1399 /// Release ExitSU predecessors and setup scheduler queues. Re-position
1400 /// the Top RP tracker in case the region beginning has changed.
initQueues(ArrayRef<SUnit * > TopRoots,ArrayRef<SUnit * > BotRoots)1401 void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
1402 ArrayRef<SUnit*> BotRoots) {
1403 ScheduleDAGMI::initQueues(TopRoots, BotRoots);
1404 if (ShouldTrackPressure) {
1405 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1406 TopRPTracker.setPos(CurrentTop);
1407 }
1408 }
1409
1410 /// Move an instruction and update register pressure.
scheduleMI(SUnit * SU,bool IsTopNode)1411 void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
1412 // Move the instruction to its new location in the instruction stream.
1413 MachineInstr *MI = SU->getInstr();
1414
1415 if (IsTopNode) {
1416 assert(SU->isTopReady() && "node still has unscheduled dependencies");
1417 if (&*CurrentTop == MI)
1418 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
1419 else {
1420 moveInstruction(MI, CurrentTop);
1421 TopRPTracker.setPos(MI);
1422 }
1423
1424 if (ShouldTrackPressure) {
1425 // Update top scheduled pressure.
1426 RegisterOperands RegOpers;
1427 RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1428 if (ShouldTrackLaneMasks) {
1429 // Adjust liveness and add missing dead+read-undef flags.
1430 SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1431 RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1432 } else {
1433 // Adjust for missing dead-def flags.
1434 RegOpers.detectDeadDefs(*MI, *LIS);
1435 }
1436
1437 TopRPTracker.advance(RegOpers);
1438 assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
1439 LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
1440 TopRPTracker.getRegSetPressureAtPos(), TRI););
1441
1442 updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
1443 }
1444 } else {
1445 assert(SU->isBottomReady() && "node still has unscheduled dependencies");
1446 MachineBasicBlock::iterator priorII =
1447 priorNonDebug(CurrentBottom, CurrentTop);
1448 if (&*priorII == MI)
1449 CurrentBottom = priorII;
1450 else {
1451 if (&*CurrentTop == MI) {
1452 CurrentTop = nextIfDebug(++CurrentTop, priorII);
1453 TopRPTracker.setPos(CurrentTop);
1454 }
1455 moveInstruction(MI, CurrentBottom);
1456 CurrentBottom = MI;
1457 BotRPTracker.setPos(CurrentBottom);
1458 }
1459 if (ShouldTrackPressure) {
1460 RegisterOperands RegOpers;
1461 RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1462 if (ShouldTrackLaneMasks) {
1463 // Adjust liveness and add missing dead+read-undef flags.
1464 SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1465 RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1466 } else {
1467 // Adjust for missing dead-def flags.
1468 RegOpers.detectDeadDefs(*MI, *LIS);
1469 }
1470
1471 if (BotRPTracker.getPos() != CurrentBottom)
1472 BotRPTracker.recedeSkipDebugValues();
1473 SmallVector<RegisterMaskPair, 8> LiveUses;
1474 BotRPTracker.recede(RegOpers, &LiveUses);
1475 assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
1476 LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
1477 BotRPTracker.getRegSetPressureAtPos(), TRI););
1478
1479 updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
1480 updatePressureDiffs(LiveUses);
1481 }
1482 }
1483 }
1484
1485 //===----------------------------------------------------------------------===//
1486 // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
1487 //===----------------------------------------------------------------------===//
1488
1489 namespace {
1490
1491 /// Post-process the DAG to create cluster edges between neighboring
1492 /// loads or between neighboring stores.
1493 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
1494 struct MemOpInfo {
1495 SUnit *SU;
1496 SmallVector<const MachineOperand *, 4> BaseOps;
1497 int64_t Offset;
1498 unsigned Width;
1499
MemOpInfo__anon6ea857550311::BaseMemOpClusterMutation::MemOpInfo1500 MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
1501 int64_t Offset, unsigned Width)
1502 : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
1503 Width(Width) {}
1504
Compare__anon6ea857550311::BaseMemOpClusterMutation::MemOpInfo1505 static bool Compare(const MachineOperand *const &A,
1506 const MachineOperand *const &B) {
1507 if (A->getType() != B->getType())
1508 return A->getType() < B->getType();
1509 if (A->isReg())
1510 return A->getReg() < B->getReg();
1511 if (A->isFI()) {
1512 const MachineFunction &MF = *A->getParent()->getParent()->getParent();
1513 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
1514 bool StackGrowsDown = TFI.getStackGrowthDirection() ==
1515 TargetFrameLowering::StackGrowsDown;
1516 return StackGrowsDown ? A->getIndex() > B->getIndex()
1517 : A->getIndex() < B->getIndex();
1518 }
1519
1520 llvm_unreachable("MemOpClusterMutation only supports register or frame "
1521 "index bases.");
1522 }
1523
operator <__anon6ea857550311::BaseMemOpClusterMutation::MemOpInfo1524 bool operator<(const MemOpInfo &RHS) const {
1525 // FIXME: Don't compare everything twice. Maybe use C++20 three way
1526 // comparison instead when it's available.
1527 if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
1528 RHS.BaseOps.begin(), RHS.BaseOps.end(),
1529 Compare))
1530 return true;
1531 if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
1532 BaseOps.begin(), BaseOps.end(), Compare))
1533 return false;
1534 if (Offset != RHS.Offset)
1535 return Offset < RHS.Offset;
1536 return SU->NodeNum < RHS.SU->NodeNum;
1537 }
1538 };
1539
1540 const TargetInstrInfo *TII;
1541 const TargetRegisterInfo *TRI;
1542 bool IsLoad;
1543
1544 public:
BaseMemOpClusterMutation(const TargetInstrInfo * tii,const TargetRegisterInfo * tri,bool IsLoad)1545 BaseMemOpClusterMutation(const TargetInstrInfo *tii,
1546 const TargetRegisterInfo *tri, bool IsLoad)
1547 : TII(tii), TRI(tri), IsLoad(IsLoad) {}
1548
1549 void apply(ScheduleDAGInstrs *DAGInstrs) override;
1550
1551 protected:
1552 void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
1553 ScheduleDAGInstrs *DAG);
1554 void collectMemOpRecords(std::vector<SUnit> &SUnits,
1555 SmallVectorImpl<MemOpInfo> &MemOpRecords);
1556 bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1557 DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
1558 };
1559
1560 class StoreClusterMutation : public BaseMemOpClusterMutation {
1561 public:
StoreClusterMutation(const TargetInstrInfo * tii,const TargetRegisterInfo * tri)1562 StoreClusterMutation(const TargetInstrInfo *tii,
1563 const TargetRegisterInfo *tri)
1564 : BaseMemOpClusterMutation(tii, tri, false) {}
1565 };
1566
1567 class LoadClusterMutation : public BaseMemOpClusterMutation {
1568 public:
LoadClusterMutation(const TargetInstrInfo * tii,const TargetRegisterInfo * tri)1569 LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
1570 : BaseMemOpClusterMutation(tii, tri, true) {}
1571 };
1572
1573 } // end anonymous namespace
1574
1575 namespace llvm {
1576
1577 std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo * TII,const TargetRegisterInfo * TRI)1578 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
1579 const TargetRegisterInfo *TRI) {
1580 return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
1581 : nullptr;
1582 }
1583
1584 std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo * TII,const TargetRegisterInfo * TRI)1585 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
1586 const TargetRegisterInfo *TRI) {
1587 return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
1588 : nullptr;
1589 }
1590
1591 } // end namespace llvm
1592
1593 // Sorting all the loads/stores first, then for each load/store, checking the
1594 // following load/store one by one, until reach the first non-dependent one and
1595 // call target hook to see if they can cluster.
1596 // If FastCluster is enabled, we assume that, all the loads/stores have been
1597 // preprocessed and now, they didn't have dependencies on each other.
clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOpRecords,bool FastCluster,ScheduleDAGInstrs * DAG)1598 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
1599 ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
1600 ScheduleDAGInstrs *DAG) {
1601 // Keep track of the current cluster length and bytes for each SUnit.
1602 DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
1603
1604 // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
1605 // cluster mem ops collected within `MemOpRecords` array.
1606 for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
1607 // Decision to cluster mem ops is taken based on target dependent logic
1608 auto MemOpa = MemOpRecords[Idx];
1609
1610 // Seek for the next load/store to do the cluster.
1611 unsigned NextIdx = Idx + 1;
1612 for (; NextIdx < End; ++NextIdx)
1613 // Skip if MemOpb has been clustered already or has dependency with
1614 // MemOpa.
1615 if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
1616 (FastCluster ||
1617 (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
1618 !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
1619 break;
1620 if (NextIdx == End)
1621 continue;
1622
1623 auto MemOpb = MemOpRecords[NextIdx];
1624 unsigned ClusterLength = 2;
1625 unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
1626 if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
1627 ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
1628 CurrentClusterBytes =
1629 SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
1630 }
1631
1632 if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
1633 CurrentClusterBytes))
1634 continue;
1635
1636 SUnit *SUa = MemOpa.SU;
1637 SUnit *SUb = MemOpb.SU;
1638 if (SUa->NodeNum > SUb->NodeNum)
1639 std::swap(SUa, SUb);
1640
1641 // FIXME: Is this check really required?
1642 if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
1643 continue;
1644
1645 LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
1646 << SUb->NodeNum << ")\n");
1647 ++NumClustered;
1648
1649 if (IsLoad) {
1650 // Copy successor edges from SUa to SUb. Interleaving computation
1651 // dependent on SUa can prevent load combining due to register reuse.
1652 // Predecessor edges do not need to be copied from SUb to SUa since
1653 // nearby loads should have effectively the same inputs.
1654 for (const SDep &Succ : SUa->Succs) {
1655 if (Succ.getSUnit() == SUb)
1656 continue;
1657 LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
1658 << ")\n");
1659 DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1660 }
1661 } else {
1662 // Copy predecessor edges from SUb to SUa to avoid the SUnits that
1663 // SUb dependent on scheduled in-between SUb and SUa. Successor edges
1664 // do not need to be copied from SUa to SUb since no one will depend
1665 // on stores.
1666 // Notice that, we don't need to care about the memory dependency as
1667 // we won't try to cluster them if they have any memory dependency.
1668 for (const SDep &Pred : SUb->Preds) {
1669 if (Pred.getSUnit() == SUa)
1670 continue;
1671 LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum
1672 << ")\n");
1673 DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1674 }
1675 }
1676
1677 SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
1678 CurrentClusterBytes};
1679
1680 LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
1681 << ", Curr cluster bytes: " << CurrentClusterBytes
1682 << "\n");
1683 }
1684 }
1685
collectMemOpRecords(std::vector<SUnit> & SUnits,SmallVectorImpl<MemOpInfo> & MemOpRecords)1686 void BaseMemOpClusterMutation::collectMemOpRecords(
1687 std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
1688 for (auto &SU : SUnits) {
1689 if ((IsLoad && !SU.getInstr()->mayLoad()) ||
1690 (!IsLoad && !SU.getInstr()->mayStore()))
1691 continue;
1692
1693 const MachineInstr &MI = *SU.getInstr();
1694 SmallVector<const MachineOperand *, 4> BaseOps;
1695 int64_t Offset;
1696 bool OffsetIsScalable;
1697 unsigned Width;
1698 if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
1699 OffsetIsScalable, Width, TRI)) {
1700 MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
1701
1702 LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
1703 << Offset << ", OffsetIsScalable: " << OffsetIsScalable
1704 << ", Width: " << Width << "\n");
1705 }
1706 #ifndef NDEBUG
1707 for (const auto *Op : BaseOps)
1708 assert(Op);
1709 #endif
1710 }
1711 }
1712
groupMemOps(ArrayRef<MemOpInfo> MemOps,ScheduleDAGInstrs * DAG,DenseMap<unsigned,SmallVector<MemOpInfo,32>> & Groups)1713 bool BaseMemOpClusterMutation::groupMemOps(
1714 ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1715 DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
1716 bool FastCluster =
1717 ForceFastCluster ||
1718 MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
1719
1720 for (const auto &MemOp : MemOps) {
1721 unsigned ChainPredID = DAG->SUnits.size();
1722 if (FastCluster) {
1723 for (const SDep &Pred : MemOp.SU->Preds) {
1724 // We only want to cluster the mem ops that have the same ctrl(non-data)
1725 // pred so that they didn't have ctrl dependency for each other. But for
1726 // store instrs, we can still cluster them if the pred is load instr.
1727 if ((Pred.isCtrl() &&
1728 (IsLoad ||
1729 (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
1730 !Pred.isArtificial()) {
1731 ChainPredID = Pred.getSUnit()->NodeNum;
1732 break;
1733 }
1734 }
1735 } else
1736 ChainPredID = 0;
1737
1738 Groups[ChainPredID].push_back(MemOp);
1739 }
1740 return FastCluster;
1741 }
1742
1743 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
apply(ScheduleDAGInstrs * DAG)1744 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
1745 // Collect all the clusterable loads/stores
1746 SmallVector<MemOpInfo, 32> MemOpRecords;
1747 collectMemOpRecords(DAG->SUnits, MemOpRecords);
1748
1749 if (MemOpRecords.size() < 2)
1750 return;
1751
1752 // Put the loads/stores without dependency into the same group with some
1753 // heuristic if the DAG is too complex to avoid compiling time blow up.
1754 // Notice that, some fusion pair could be lost with this.
1755 DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
1756 bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
1757
1758 for (auto &Group : Groups) {
1759 // Sorting the loads/stores, so that, we can stop the cluster as early as
1760 // possible.
1761 llvm::sort(Group.second);
1762
1763 // Trying to cluster all the neighboring loads/stores.
1764 clusterNeighboringMemOps(Group.second, FastCluster, DAG);
1765 }
1766 }
1767
1768 //===----------------------------------------------------------------------===//
1769 // CopyConstrain - DAG post-processing to encourage copy elimination.
1770 //===----------------------------------------------------------------------===//
1771
1772 namespace {
1773
1774 /// Post-process the DAG to create weak edges from all uses of a copy to
1775 /// the one use that defines the copy's source vreg, most likely an induction
1776 /// variable increment.
1777 class CopyConstrain : public ScheduleDAGMutation {
1778 // Transient state.
1779 SlotIndex RegionBeginIdx;
1780
1781 // RegionEndIdx is the slot index of the last non-debug instruction in the
1782 // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
1783 SlotIndex RegionEndIdx;
1784
1785 public:
CopyConstrain(const TargetInstrInfo *,const TargetRegisterInfo *)1786 CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
1787
1788 void apply(ScheduleDAGInstrs *DAGInstrs) override;
1789
1790 protected:
1791 void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
1792 };
1793
1794 } // end anonymous namespace
1795
1796 namespace llvm {
1797
1798 std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo * TII,const TargetRegisterInfo * TRI)1799 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
1800 const TargetRegisterInfo *TRI) {
1801 return std::make_unique<CopyConstrain>(TII, TRI);
1802 }
1803
1804 } // end namespace llvm
1805
1806 /// constrainLocalCopy handles two possibilities:
1807 /// 1) Local src:
1808 /// I0: = dst
1809 /// I1: src = ...
1810 /// I2: = dst
1811 /// I3: dst = src (copy)
1812 /// (create pred->succ edges I0->I1, I2->I1)
1813 ///
1814 /// 2) Local copy:
1815 /// I0: dst = src (copy)
1816 /// I1: = dst
1817 /// I2: src = ...
1818 /// I3: = dst
1819 /// (create pred->succ edges I1->I2, I3->I2)
1820 ///
1821 /// Although the MachineScheduler is currently constrained to single blocks,
1822 /// this algorithm should handle extended blocks. An EBB is a set of
1823 /// contiguously numbered blocks such that the previous block in the EBB is
1824 /// always the single predecessor.
constrainLocalCopy(SUnit * CopySU,ScheduleDAGMILive * DAG)1825 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
1826 LiveIntervals *LIS = DAG->getLIS();
1827 MachineInstr *Copy = CopySU->getInstr();
1828
1829 // Check for pure vreg copies.
1830 const MachineOperand &SrcOp = Copy->getOperand(1);
1831 Register SrcReg = SrcOp.getReg();
1832 if (!SrcReg.isVirtual() || !SrcOp.readsReg())
1833 return;
1834
1835 const MachineOperand &DstOp = Copy->getOperand(0);
1836 Register DstReg = DstOp.getReg();
1837 if (!DstReg.isVirtual() || DstOp.isDead())
1838 return;
1839
1840 // Check if either the dest or source is local. If it's live across a back
1841 // edge, it's not local. Note that if both vregs are live across the back
1842 // edge, we cannot successfully contrain the copy without cyclic scheduling.
1843 // If both the copy's source and dest are local live intervals, then we
1844 // should treat the dest as the global for the purpose of adding
1845 // constraints. This adds edges from source's other uses to the copy.
1846 unsigned LocalReg = SrcReg;
1847 unsigned GlobalReg = DstReg;
1848 LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
1849 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
1850 LocalReg = DstReg;
1851 GlobalReg = SrcReg;
1852 LocalLI = &LIS->getInterval(LocalReg);
1853 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
1854 return;
1855 }
1856 LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
1857
1858 // Find the global segment after the start of the local LI.
1859 LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
1860 // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
1861 // local live range. We could create edges from other global uses to the local
1862 // start, but the coalescer should have already eliminated these cases, so
1863 // don't bother dealing with it.
1864 if (GlobalSegment == GlobalLI->end())
1865 return;
1866
1867 // If GlobalSegment is killed at the LocalLI->start, the call to find()
1868 // returned the next global segment. But if GlobalSegment overlaps with
1869 // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
1870 // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
1871 if (GlobalSegment->contains(LocalLI->beginIndex()))
1872 ++GlobalSegment;
1873
1874 if (GlobalSegment == GlobalLI->end())
1875 return;
1876
1877 // Check if GlobalLI contains a hole in the vicinity of LocalLI.
1878 if (GlobalSegment != GlobalLI->begin()) {
1879 // Two address defs have no hole.
1880 if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
1881 GlobalSegment->start)) {
1882 return;
1883 }
1884 // If the prior global segment may be defined by the same two-address
1885 // instruction that also defines LocalLI, then can't make a hole here.
1886 if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
1887 LocalLI->beginIndex())) {
1888 return;
1889 }
1890 // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
1891 // it would be a disconnected component in the live range.
1892 assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
1893 "Disconnected LRG within the scheduling region.");
1894 }
1895 MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
1896 if (!GlobalDef)
1897 return;
1898
1899 SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
1900 if (!GlobalSU)
1901 return;
1902
1903 // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
1904 // constraining the uses of the last local def to precede GlobalDef.
1905 SmallVector<SUnit*,8> LocalUses;
1906 const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
1907 MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
1908 SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
1909 for (const SDep &Succ : LastLocalSU->Succs) {
1910 if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
1911 continue;
1912 if (Succ.getSUnit() == GlobalSU)
1913 continue;
1914 if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
1915 return;
1916 LocalUses.push_back(Succ.getSUnit());
1917 }
1918 // Open the top of the GlobalLI hole by constraining any earlier global uses
1919 // to precede the start of LocalLI.
1920 SmallVector<SUnit*,8> GlobalUses;
1921 MachineInstr *FirstLocalDef =
1922 LIS->getInstructionFromIndex(LocalLI->beginIndex());
1923 SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
1924 for (const SDep &Pred : GlobalSU->Preds) {
1925 if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
1926 continue;
1927 if (Pred.getSUnit() == FirstLocalSU)
1928 continue;
1929 if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
1930 return;
1931 GlobalUses.push_back(Pred.getSUnit());
1932 }
1933 LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
1934 // Add the weak edges.
1935 for (SUnit *LU : LocalUses) {
1936 LLVM_DEBUG(dbgs() << " Local use SU(" << LU->NodeNum << ") -> SU("
1937 << GlobalSU->NodeNum << ")\n");
1938 DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
1939 }
1940 for (SUnit *GU : GlobalUses) {
1941 LLVM_DEBUG(dbgs() << " Global use SU(" << GU->NodeNum << ") -> SU("
1942 << FirstLocalSU->NodeNum << ")\n");
1943 DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
1944 }
1945 }
1946
1947 /// Callback from DAG postProcessing to create weak edges to encourage
1948 /// copy elimination.
apply(ScheduleDAGInstrs * DAGInstrs)1949 void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
1950 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1951 assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
1952
1953 MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
1954 if (FirstPos == DAG->end())
1955 return;
1956 RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
1957 RegionEndIdx = DAG->getLIS()->getInstructionIndex(
1958 *priorNonDebug(DAG->end(), DAG->begin()));
1959
1960 for (SUnit &SU : DAG->SUnits) {
1961 if (!SU.getInstr()->isCopy())
1962 continue;
1963
1964 constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
1965 }
1966 }
1967
1968 //===----------------------------------------------------------------------===//
1969 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
1970 // and possibly other custom schedulers.
1971 //===----------------------------------------------------------------------===//
1972
1973 static const unsigned InvalidCycle = ~0U;
1974
~SchedBoundary()1975 SchedBoundary::~SchedBoundary() { delete HazardRec; }
1976
1977 /// Given a Count of resource usage and a Latency value, return true if a
1978 /// SchedBoundary becomes resource limited.
1979 /// If we are checking after scheduling a node, we should return true when
1980 /// we just reach the resource limit.
checkResourceLimit(unsigned LFactor,unsigned Count,unsigned Latency,bool AfterSchedNode)1981 static bool checkResourceLimit(unsigned LFactor, unsigned Count,
1982 unsigned Latency, bool AfterSchedNode) {
1983 int ResCntFactor = (int)(Count - (Latency * LFactor));
1984 if (AfterSchedNode)
1985 return ResCntFactor >= (int)LFactor;
1986 else
1987 return ResCntFactor > (int)LFactor;
1988 }
1989
reset()1990 void SchedBoundary::reset() {
1991 // A new HazardRec is created for each DAG and owned by SchedBoundary.
1992 // Destroying and reconstructing it is very expensive though. So keep
1993 // invalid, placeholder HazardRecs.
1994 if (HazardRec && HazardRec->isEnabled()) {
1995 delete HazardRec;
1996 HazardRec = nullptr;
1997 }
1998 Available.clear();
1999 Pending.clear();
2000 CheckPending = false;
2001 CurrCycle = 0;
2002 CurrMOps = 0;
2003 MinReadyCycle = std::numeric_limits<unsigned>::max();
2004 ExpectedLatency = 0;
2005 DependentLatency = 0;
2006 RetiredMOps = 0;
2007 MaxExecutedResCount = 0;
2008 ZoneCritResIdx = 0;
2009 IsResourceLimited = false;
2010 ReservedCycles.clear();
2011 ReservedCyclesIndex.clear();
2012 ResourceGroupSubUnitMasks.clear();
2013 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2014 // Track the maximum number of stall cycles that could arise either from the
2015 // latency of a DAG edge or the number of cycles that a processor resource is
2016 // reserved (SchedBoundary::ReservedCycles).
2017 MaxObservedStall = 0;
2018 #endif
2019 // Reserve a zero-count for invalid CritResIdx.
2020 ExecutedResCounts.resize(1);
2021 assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
2022 }
2023
2024 void SchedRemainder::
init(ScheduleDAGMI * DAG,const TargetSchedModel * SchedModel)2025 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
2026 reset();
2027 if (!SchedModel->hasInstrSchedModel())
2028 return;
2029 RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
2030 for (SUnit &SU : DAG->SUnits) {
2031 const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
2032 RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
2033 * SchedModel->getMicroOpFactor();
2034 for (TargetSchedModel::ProcResIter
2035 PI = SchedModel->getWriteProcResBegin(SC),
2036 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2037 unsigned PIdx = PI->ProcResourceIdx;
2038 unsigned Factor = SchedModel->getResourceFactor(PIdx);
2039 RemainingCounts[PIdx] += (Factor * PI->Cycles);
2040 }
2041 }
2042 }
2043
2044 void SchedBoundary::
init(ScheduleDAGMI * dag,const TargetSchedModel * smodel,SchedRemainder * rem)2045 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
2046 reset();
2047 DAG = dag;
2048 SchedModel = smodel;
2049 Rem = rem;
2050 if (SchedModel->hasInstrSchedModel()) {
2051 unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2052 ReservedCyclesIndex.resize(ResourceCount);
2053 ExecutedResCounts.resize(ResourceCount);
2054 ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
2055 unsigned NumUnits = 0;
2056
2057 for (unsigned i = 0; i < ResourceCount; ++i) {
2058 ReservedCyclesIndex[i] = NumUnits;
2059 NumUnits += SchedModel->getProcResource(i)->NumUnits;
2060 if (isUnbufferedGroup(i)) {
2061 auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
2062 for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
2063 U != UE; ++U)
2064 ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
2065 }
2066 }
2067
2068 ReservedCycles.resize(NumUnits, InvalidCycle);
2069 }
2070 }
2071
2072 /// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
2073 /// these "soft stalls" differently than the hard stall cycles based on CPU
2074 /// resources and computed by checkHazard(). A fully in-order model
2075 /// (MicroOpBufferSize==0) will not make use of this since instructions are not
2076 /// available for scheduling until they are ready. However, a weaker in-order
2077 /// model may use this for heuristics. For example, if a processor has in-order
2078 /// behavior when reading certain resources, this may come into play.
getLatencyStallCycles(SUnit * SU)2079 unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
2080 if (!SU->isUnbuffered)
2081 return 0;
2082
2083 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2084 if (ReadyCycle > CurrCycle)
2085 return ReadyCycle - CurrCycle;
2086 return 0;
2087 }
2088
2089 /// Compute the next cycle at which the given processor resource unit
2090 /// can be scheduled.
getNextResourceCycleByInstance(unsigned InstanceIdx,unsigned Cycles)2091 unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
2092 unsigned Cycles) {
2093 unsigned NextUnreserved = ReservedCycles[InstanceIdx];
2094 // If this resource has never been used, always return cycle zero.
2095 if (NextUnreserved == InvalidCycle)
2096 return 0;
2097 // For bottom-up scheduling add the cycles needed for the current operation.
2098 if (!isTop())
2099 NextUnreserved += Cycles;
2100 return NextUnreserved;
2101 }
2102
2103 /// Compute the next cycle at which the given processor resource can be
2104 /// scheduled. Returns the next cycle and the index of the processor resource
2105 /// instance in the reserved cycles vector.
2106 std::pair<unsigned, unsigned>
getNextResourceCycle(const MCSchedClassDesc * SC,unsigned PIdx,unsigned Cycles)2107 SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
2108 unsigned Cycles) {
2109
2110 unsigned MinNextUnreserved = InvalidCycle;
2111 unsigned InstanceIdx = 0;
2112 unsigned StartIndex = ReservedCyclesIndex[PIdx];
2113 unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits;
2114 assert(NumberOfInstances > 0 &&
2115 "Cannot have zero instances of a ProcResource");
2116
2117 if (isUnbufferedGroup(PIdx)) {
2118 // If any subunits are used by the instruction, report that the resource
2119 // group is available at 0, effectively removing the group record from
2120 // hazarding and basing the hazarding decisions on the subunit records.
2121 // Otherwise, choose the first available instance from among the subunits.
2122 // Specifications which assign cycles to both the subunits and the group or
2123 // which use an unbuffered group with buffered subunits will appear to
2124 // schedule strangely. In the first case, the additional cycles for the
2125 // group will be ignored. In the second, the group will be ignored
2126 // entirely.
2127 for (const MCWriteProcResEntry &PE :
2128 make_range(SchedModel->getWriteProcResBegin(SC),
2129 SchedModel->getWriteProcResEnd(SC)))
2130 if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
2131 return std::make_pair(0u, StartIndex);
2132
2133 auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
2134 for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
2135 unsigned NextUnreserved, NextInstanceIdx;
2136 std::tie(NextUnreserved, NextInstanceIdx) =
2137 getNextResourceCycle(SC, SubUnits[I], Cycles);
2138 if (MinNextUnreserved > NextUnreserved) {
2139 InstanceIdx = NextInstanceIdx;
2140 MinNextUnreserved = NextUnreserved;
2141 }
2142 }
2143 return std::make_pair(MinNextUnreserved, InstanceIdx);
2144 }
2145
2146 for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
2147 ++I) {
2148 unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles);
2149 if (MinNextUnreserved > NextUnreserved) {
2150 InstanceIdx = I;
2151 MinNextUnreserved = NextUnreserved;
2152 }
2153 }
2154 return std::make_pair(MinNextUnreserved, InstanceIdx);
2155 }
2156
2157 /// Does this SU have a hazard within the current instruction group.
2158 ///
2159 /// The scheduler supports two modes of hazard recognition. The first is the
2160 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
2161 /// supports highly complicated in-order reservation tables
2162 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
2163 ///
2164 /// The second is a streamlined mechanism that checks for hazards based on
2165 /// simple counters that the scheduler itself maintains. It explicitly checks
2166 /// for instruction dispatch limitations, including the number of micro-ops that
2167 /// can dispatch per cycle.
2168 ///
2169 /// TODO: Also check whether the SU must start a new group.
checkHazard(SUnit * SU)2170 bool SchedBoundary::checkHazard(SUnit *SU) {
2171 if (HazardRec->isEnabled()
2172 && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
2173 return true;
2174 }
2175
2176 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
2177 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
2178 LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
2179 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
2180 return true;
2181 }
2182
2183 if (CurrMOps > 0 &&
2184 ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
2185 (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
2186 LLVM_DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
2187 << (isTop() ? "begin" : "end") << " group\n");
2188 return true;
2189 }
2190
2191 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
2192 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2193 for (const MCWriteProcResEntry &PE :
2194 make_range(SchedModel->getWriteProcResBegin(SC),
2195 SchedModel->getWriteProcResEnd(SC))) {
2196 unsigned ResIdx = PE.ProcResourceIdx;
2197 unsigned Cycles = PE.Cycles;
2198 unsigned NRCycle, InstanceIdx;
2199 std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles);
2200 if (NRCycle > CurrCycle) {
2201 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2202 MaxObservedStall = std::max(Cycles, MaxObservedStall);
2203 #endif
2204 LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
2205 << SchedModel->getResourceName(ResIdx)
2206 << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx] << ']'
2207 << "=" << NRCycle << "c\n");
2208 return true;
2209 }
2210 }
2211 }
2212 return false;
2213 }
2214
2215 // Find the unscheduled node in ReadySUs with the highest latency.
2216 unsigned SchedBoundary::
findMaxLatency(ArrayRef<SUnit * > ReadySUs)2217 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
2218 SUnit *LateSU = nullptr;
2219 unsigned RemLatency = 0;
2220 for (SUnit *SU : ReadySUs) {
2221 unsigned L = getUnscheduledLatency(SU);
2222 if (L > RemLatency) {
2223 RemLatency = L;
2224 LateSU = SU;
2225 }
2226 }
2227 if (LateSU) {
2228 LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
2229 << LateSU->NodeNum << ") " << RemLatency << "c\n");
2230 }
2231 return RemLatency;
2232 }
2233
2234 // Count resources in this zone and the remaining unscheduled
2235 // instruction. Return the max count, scaled. Set OtherCritIdx to the critical
2236 // resource index, or zero if the zone is issue limited.
2237 unsigned SchedBoundary::
getOtherResourceCount(unsigned & OtherCritIdx)2238 getOtherResourceCount(unsigned &OtherCritIdx) {
2239 OtherCritIdx = 0;
2240 if (!SchedModel->hasInstrSchedModel())
2241 return 0;
2242
2243 unsigned OtherCritCount = Rem->RemIssueCount
2244 + (RetiredMOps * SchedModel->getMicroOpFactor());
2245 LLVM_DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
2246 << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
2247 for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
2248 PIdx != PEnd; ++PIdx) {
2249 unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
2250 if (OtherCount > OtherCritCount) {
2251 OtherCritCount = OtherCount;
2252 OtherCritIdx = PIdx;
2253 }
2254 }
2255 if (OtherCritIdx) {
2256 LLVM_DEBUG(
2257 dbgs() << " " << Available.getName() << " + Remain CritRes: "
2258 << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
2259 << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
2260 }
2261 return OtherCritCount;
2262 }
2263
releaseNode(SUnit * SU,unsigned ReadyCycle,bool InPQueue,unsigned Idx)2264 void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
2265 unsigned Idx) {
2266 assert(SU->getInstr() && "Scheduled SUnit must have instr");
2267
2268 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2269 // ReadyCycle was been bumped up to the CurrCycle when this node was
2270 // scheduled, but CurrCycle may have been eagerly advanced immediately after
2271 // scheduling, so may now be greater than ReadyCycle.
2272 if (ReadyCycle > CurrCycle)
2273 MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
2274 #endif
2275
2276 if (ReadyCycle < MinReadyCycle)
2277 MinReadyCycle = ReadyCycle;
2278
2279 // Check for interlocks first. For the purpose of other heuristics, an
2280 // instruction that cannot issue appears as if it's not in the ReadyQueue.
2281 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2282 bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) ||
2283 checkHazard(SU) || (Available.size() >= ReadyListLimit);
2284
2285 if (!HazardDetected) {
2286 Available.push(SU);
2287
2288 if (InPQueue)
2289 Pending.remove(Pending.begin() + Idx);
2290 return;
2291 }
2292
2293 if (!InPQueue)
2294 Pending.push(SU);
2295 }
2296
2297 /// Move the boundary of scheduled code by one cycle.
bumpCycle(unsigned NextCycle)2298 void SchedBoundary::bumpCycle(unsigned NextCycle) {
2299 if (SchedModel->getMicroOpBufferSize() == 0) {
2300 assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
2301 "MinReadyCycle uninitialized");
2302 if (MinReadyCycle > NextCycle)
2303 NextCycle = MinReadyCycle;
2304 }
2305 // Update the current micro-ops, which will issue in the next cycle.
2306 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
2307 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
2308
2309 // Decrement DependentLatency based on the next cycle.
2310 if ((NextCycle - CurrCycle) > DependentLatency)
2311 DependentLatency = 0;
2312 else
2313 DependentLatency -= (NextCycle - CurrCycle);
2314
2315 if (!HazardRec->isEnabled()) {
2316 // Bypass HazardRec virtual calls.
2317 CurrCycle = NextCycle;
2318 } else {
2319 // Bypass getHazardType calls in case of long latency.
2320 for (; CurrCycle != NextCycle; ++CurrCycle) {
2321 if (isTop())
2322 HazardRec->AdvanceCycle();
2323 else
2324 HazardRec->RecedeCycle();
2325 }
2326 }
2327 CheckPending = true;
2328 IsResourceLimited =
2329 checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2330 getScheduledLatency(), true);
2331
2332 LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
2333 << '\n');
2334 }
2335
incExecutedResources(unsigned PIdx,unsigned Count)2336 void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
2337 ExecutedResCounts[PIdx] += Count;
2338 if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
2339 MaxExecutedResCount = ExecutedResCounts[PIdx];
2340 }
2341
2342 /// Add the given processor resource to this scheduled zone.
2343 ///
2344 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles
2345 /// during which this resource is consumed.
2346 ///
2347 /// \return the next cycle at which the instruction may execute without
2348 /// oversubscribing resources.
countResource(const MCSchedClassDesc * SC,unsigned PIdx,unsigned Cycles,unsigned NextCycle)2349 unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
2350 unsigned Cycles, unsigned NextCycle) {
2351 unsigned Factor = SchedModel->getResourceFactor(PIdx);
2352 unsigned Count = Factor * Cycles;
2353 LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +"
2354 << Cycles << "x" << Factor << "u\n");
2355
2356 // Update Executed resources counts.
2357 incExecutedResources(PIdx, Count);
2358 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
2359 Rem->RemainingCounts[PIdx] -= Count;
2360
2361 // Check if this resource exceeds the current critical resource. If so, it
2362 // becomes the critical resource.
2363 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
2364 ZoneCritResIdx = PIdx;
2365 LLVM_DEBUG(dbgs() << " *** Critical resource "
2366 << SchedModel->getResourceName(PIdx) << ": "
2367 << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
2368 << "c\n");
2369 }
2370 // For reserved resources, record the highest cycle using the resource.
2371 unsigned NextAvailable, InstanceIdx;
2372 std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles);
2373 if (NextAvailable > CurrCycle) {
2374 LLVM_DEBUG(dbgs() << " Resource conflict: "
2375 << SchedModel->getResourceName(PIdx)
2376 << '[' << InstanceIdx - ReservedCyclesIndex[PIdx] << ']'
2377 << " reserved until @" << NextAvailable << "\n");
2378 }
2379 return NextAvailable;
2380 }
2381
2382 /// Move the boundary of scheduled code by one SUnit.
bumpNode(SUnit * SU)2383 void SchedBoundary::bumpNode(SUnit *SU) {
2384 // Update the reservation table.
2385 if (HazardRec->isEnabled()) {
2386 if (!isTop() && SU->isCall) {
2387 // Calls are scheduled with their preceding instructions. For bottom-up
2388 // scheduling, clear the pipeline state before emitting.
2389 HazardRec->Reset();
2390 }
2391 HazardRec->EmitInstruction(SU);
2392 // Scheduling an instruction may have made pending instructions available.
2393 CheckPending = true;
2394 }
2395 // checkHazard should prevent scheduling multiple instructions per cycle that
2396 // exceed the issue width.
2397 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2398 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
2399 assert(
2400 (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
2401 "Cannot schedule this instruction's MicroOps in the current cycle.");
2402
2403 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2404 LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
2405
2406 unsigned NextCycle = CurrCycle;
2407 switch (SchedModel->getMicroOpBufferSize()) {
2408 case 0:
2409 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
2410 break;
2411 case 1:
2412 if (ReadyCycle > NextCycle) {
2413 NextCycle = ReadyCycle;
2414 LLVM_DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
2415 }
2416 break;
2417 default:
2418 // We don't currently model the OOO reorder buffer, so consider all
2419 // scheduled MOps to be "retired". We do loosely model in-order resource
2420 // latency. If this instruction uses an in-order resource, account for any
2421 // likely stall cycles.
2422 if (SU->isUnbuffered && ReadyCycle > NextCycle)
2423 NextCycle = ReadyCycle;
2424 break;
2425 }
2426 RetiredMOps += IncMOps;
2427
2428 // Update resource counts and critical resource.
2429 if (SchedModel->hasInstrSchedModel()) {
2430 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
2431 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
2432 Rem->RemIssueCount -= DecRemIssue;
2433 if (ZoneCritResIdx) {
2434 // Scale scheduled micro-ops for comparing with the critical resource.
2435 unsigned ScaledMOps =
2436 RetiredMOps * SchedModel->getMicroOpFactor();
2437
2438 // If scaled micro-ops are now more than the previous critical resource by
2439 // a full cycle, then micro-ops issue becomes critical.
2440 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
2441 >= (int)SchedModel->getLatencyFactor()) {
2442 ZoneCritResIdx = 0;
2443 LLVM_DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
2444 << ScaledMOps / SchedModel->getLatencyFactor()
2445 << "c\n");
2446 }
2447 }
2448 for (TargetSchedModel::ProcResIter
2449 PI = SchedModel->getWriteProcResBegin(SC),
2450 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2451 unsigned RCycle =
2452 countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle);
2453 if (RCycle > NextCycle)
2454 NextCycle = RCycle;
2455 }
2456 if (SU->hasReservedResource) {
2457 // For reserved resources, record the highest cycle using the resource.
2458 // For top-down scheduling, this is the cycle in which we schedule this
2459 // instruction plus the number of cycles the operations reserves the
2460 // resource. For bottom-up is it simply the instruction's cycle.
2461 for (TargetSchedModel::ProcResIter
2462 PI = SchedModel->getWriteProcResBegin(SC),
2463 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2464 unsigned PIdx = PI->ProcResourceIdx;
2465 if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
2466 unsigned ReservedUntil, InstanceIdx;
2467 std::tie(ReservedUntil, InstanceIdx) =
2468 getNextResourceCycle(SC, PIdx, 0);
2469 if (isTop()) {
2470 ReservedCycles[InstanceIdx] =
2471 std::max(ReservedUntil, NextCycle + PI->Cycles);
2472 } else
2473 ReservedCycles[InstanceIdx] = NextCycle;
2474 }
2475 }
2476 }
2477 }
2478 // Update ExpectedLatency and DependentLatency.
2479 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
2480 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
2481 if (SU->getDepth() > TopLatency) {
2482 TopLatency = SU->getDepth();
2483 LLVM_DEBUG(dbgs() << " " << Available.getName() << " TopLatency SU("
2484 << SU->NodeNum << ") " << TopLatency << "c\n");
2485 }
2486 if (SU->getHeight() > BotLatency) {
2487 BotLatency = SU->getHeight();
2488 LLVM_DEBUG(dbgs() << " " << Available.getName() << " BotLatency SU("
2489 << SU->NodeNum << ") " << BotLatency << "c\n");
2490 }
2491 // If we stall for any reason, bump the cycle.
2492 if (NextCycle > CurrCycle)
2493 bumpCycle(NextCycle);
2494 else
2495 // After updating ZoneCritResIdx and ExpectedLatency, check if we're
2496 // resource limited. If a stall occurred, bumpCycle does this.
2497 IsResourceLimited =
2498 checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2499 getScheduledLatency(), true);
2500
2501 // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
2502 // resets CurrMOps. Loop to handle instructions with more MOps than issue in
2503 // one cycle. Since we commonly reach the max MOps here, opportunistically
2504 // bump the cycle to avoid uselessly checking everything in the readyQ.
2505 CurrMOps += IncMOps;
2506
2507 // Bump the cycle count for issue group constraints.
2508 // This must be done after NextCycle has been adjust for all other stalls.
2509 // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
2510 // currCycle to X.
2511 if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) ||
2512 (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
2513 LLVM_DEBUG(dbgs() << " Bump cycle to " << (isTop() ? "end" : "begin")
2514 << " group\n");
2515 bumpCycle(++NextCycle);
2516 }
2517
2518 while (CurrMOps >= SchedModel->getIssueWidth()) {
2519 LLVM_DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle "
2520 << CurrCycle << '\n');
2521 bumpCycle(++NextCycle);
2522 }
2523 LLVM_DEBUG(dumpScheduledState());
2524 }
2525
2526 /// Release pending ready nodes in to the available queue. This makes them
2527 /// visible to heuristics.
releasePending()2528 void SchedBoundary::releasePending() {
2529 // If the available queue is empty, it is safe to reset MinReadyCycle.
2530 if (Available.empty())
2531 MinReadyCycle = std::numeric_limits<unsigned>::max();
2532
2533 // Check to see if any of the pending instructions are ready to issue. If
2534 // so, add them to the available queue.
2535 for (unsigned I = 0, E = Pending.size(); I < E; ++I) {
2536 SUnit *SU = *(Pending.begin() + I);
2537 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
2538
2539 if (ReadyCycle < MinReadyCycle)
2540 MinReadyCycle = ReadyCycle;
2541
2542 if (Available.size() >= ReadyListLimit)
2543 break;
2544
2545 releaseNode(SU, ReadyCycle, true, I);
2546 if (E != Pending.size()) {
2547 --I;
2548 --E;
2549 }
2550 }
2551 CheckPending = false;
2552 }
2553
2554 /// Remove SU from the ready set for this boundary.
removeReady(SUnit * SU)2555 void SchedBoundary::removeReady(SUnit *SU) {
2556 if (Available.isInQueue(SU))
2557 Available.remove(Available.find(SU));
2558 else {
2559 assert(Pending.isInQueue(SU) && "bad ready count");
2560 Pending.remove(Pending.find(SU));
2561 }
2562 }
2563
2564 /// If this queue only has one ready candidate, return it. As a side effect,
2565 /// defer any nodes that now hit a hazard, and advance the cycle until at least
2566 /// one node is ready. If multiple instructions are ready, return NULL.
pickOnlyChoice()2567 SUnit *SchedBoundary::pickOnlyChoice() {
2568 if (CheckPending)
2569 releasePending();
2570
2571 // Defer any ready instrs that now have a hazard.
2572 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
2573 if (checkHazard(*I)) {
2574 Pending.push(*I);
2575 I = Available.remove(I);
2576 continue;
2577 }
2578 ++I;
2579 }
2580 for (unsigned i = 0; Available.empty(); ++i) {
2581 // FIXME: Re-enable assert once PR20057 is resolved.
2582 // assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
2583 // "permanent hazard");
2584 (void)i;
2585 bumpCycle(CurrCycle + 1);
2586 releasePending();
2587 }
2588
2589 LLVM_DEBUG(Pending.dump());
2590 LLVM_DEBUG(Available.dump());
2591
2592 if (Available.size() == 1)
2593 return *Available.begin();
2594 return nullptr;
2595 }
2596
2597 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2598
2599 /// Dump the content of the \ref ReservedCycles vector for the
2600 /// resources that are used in the basic block.
2601 ///
dumpReservedCycles() const2602 LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const {
2603 if (!SchedModel->hasInstrSchedModel())
2604 return;
2605
2606 unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2607 unsigned StartIdx = 0;
2608
2609 for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) {
2610 const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits;
2611 std::string ResName = SchedModel->getResourceName(ResIdx);
2612 for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) {
2613 dbgs() << ResName << "(" << UnitIdx
2614 << ") = " << ReservedCycles[StartIdx + UnitIdx] << "\n";
2615 }
2616 StartIdx += NumUnits;
2617 }
2618 }
2619
2620 // This is useful information to dump after bumpNode.
2621 // Note that the Queue contents are more useful before pickNodeFromQueue.
dumpScheduledState() const2622 LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
2623 unsigned ResFactor;
2624 unsigned ResCount;
2625 if (ZoneCritResIdx) {
2626 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
2627 ResCount = getResourceCount(ZoneCritResIdx);
2628 } else {
2629 ResFactor = SchedModel->getMicroOpFactor();
2630 ResCount = RetiredMOps * ResFactor;
2631 }
2632 unsigned LFactor = SchedModel->getLatencyFactor();
2633 dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
2634 << " Retired: " << RetiredMOps;
2635 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c";
2636 dbgs() << "\n Critical: " << ResCount / LFactor << "c, "
2637 << ResCount / ResFactor << " "
2638 << SchedModel->getResourceName(ZoneCritResIdx)
2639 << "\n ExpectedLatency: " << ExpectedLatency << "c\n"
2640 << (IsResourceLimited ? " - Resource" : " - Latency")
2641 << " limited.\n";
2642 if (MISchedDumpReservedCycles)
2643 dumpReservedCycles();
2644 }
2645 #endif
2646
2647 //===----------------------------------------------------------------------===//
2648 // GenericScheduler - Generic implementation of MachineSchedStrategy.
2649 //===----------------------------------------------------------------------===//
2650
2651 void GenericSchedulerBase::SchedCandidate::
initResourceDelta(const ScheduleDAGMI * DAG,const TargetSchedModel * SchedModel)2652 initResourceDelta(const ScheduleDAGMI *DAG,
2653 const TargetSchedModel *SchedModel) {
2654 if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2655 return;
2656
2657 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2658 for (TargetSchedModel::ProcResIter
2659 PI = SchedModel->getWriteProcResBegin(SC),
2660 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2661 if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2662 ResDelta.CritResources += PI->Cycles;
2663 if (PI->ProcResourceIdx == Policy.DemandResIdx)
2664 ResDelta.DemandedResources += PI->Cycles;
2665 }
2666 }
2667
2668 /// Compute remaining latency. We need this both to determine whether the
2669 /// overall schedule has become latency-limited and whether the instructions
2670 /// outside this zone are resource or latency limited.
2671 ///
2672 /// The "dependent" latency is updated incrementally during scheduling as the
2673 /// max height/depth of scheduled nodes minus the cycles since it was
2674 /// scheduled:
2675 /// DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
2676 ///
2677 /// The "independent" latency is the max ready queue depth:
2678 /// ILat = max N.depth for N in Available|Pending
2679 ///
2680 /// RemainingLatency is the greater of independent and dependent latency.
2681 ///
2682 /// These computations are expensive, especially in DAGs with many edges, so
2683 /// only do them if necessary.
computeRemLatency(SchedBoundary & CurrZone)2684 static unsigned computeRemLatency(SchedBoundary &CurrZone) {
2685 unsigned RemLatency = CurrZone.getDependentLatency();
2686 RemLatency = std::max(RemLatency,
2687 CurrZone.findMaxLatency(CurrZone.Available.elements()));
2688 RemLatency = std::max(RemLatency,
2689 CurrZone.findMaxLatency(CurrZone.Pending.elements()));
2690 return RemLatency;
2691 }
2692
2693 /// Returns true if the current cycle plus remaning latency is greater than
2694 /// the critical path in the scheduling region.
shouldReduceLatency(const CandPolicy & Policy,SchedBoundary & CurrZone,bool ComputeRemLatency,unsigned & RemLatency) const2695 bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
2696 SchedBoundary &CurrZone,
2697 bool ComputeRemLatency,
2698 unsigned &RemLatency) const {
2699 // The current cycle is already greater than the critical path, so we are
2700 // already latency limited and don't need to compute the remaining latency.
2701 if (CurrZone.getCurrCycle() > Rem.CriticalPath)
2702 return true;
2703
2704 // If we haven't scheduled anything yet, then we aren't latency limited.
2705 if (CurrZone.getCurrCycle() == 0)
2706 return false;
2707
2708 if (ComputeRemLatency)
2709 RemLatency = computeRemLatency(CurrZone);
2710
2711 return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath;
2712 }
2713
2714 /// Set the CandPolicy given a scheduling zone given the current resources and
2715 /// latencies inside and outside the zone.
setPolicy(CandPolicy & Policy,bool IsPostRA,SchedBoundary & CurrZone,SchedBoundary * OtherZone)2716 void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
2717 SchedBoundary &CurrZone,
2718 SchedBoundary *OtherZone) {
2719 // Apply preemptive heuristics based on the total latency and resources
2720 // inside and outside this zone. Potential stalls should be considered before
2721 // following this policy.
2722
2723 // Compute the critical resource outside the zone.
2724 unsigned OtherCritIdx = 0;
2725 unsigned OtherCount =
2726 OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
2727
2728 bool OtherResLimited = false;
2729 unsigned RemLatency = 0;
2730 bool RemLatencyComputed = false;
2731 if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
2732 RemLatency = computeRemLatency(CurrZone);
2733 RemLatencyComputed = true;
2734 OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
2735 OtherCount, RemLatency, false);
2736 }
2737
2738 // Schedule aggressively for latency in PostRA mode. We don't check for
2739 // acyclic latency during PostRA, and highly out-of-order processors will
2740 // skip PostRA scheduling.
2741 if (!OtherResLimited &&
2742 (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
2743 RemLatency))) {
2744 Policy.ReduceLatency |= true;
2745 LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName()
2746 << " RemainingLatency " << RemLatency << " + "
2747 << CurrZone.getCurrCycle() << "c > CritPath "
2748 << Rem.CriticalPath << "\n");
2749 }
2750 // If the same resource is limiting inside and outside the zone, do nothing.
2751 if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
2752 return;
2753
2754 LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
2755 dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
2756 << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
2757 } if (OtherResLimited) dbgs()
2758 << " RemainingLimit: "
2759 << SchedModel->getResourceName(OtherCritIdx) << "\n";
2760 if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
2761 << " Latency limited both directions.\n");
2762
2763 if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
2764 Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
2765
2766 if (OtherResLimited)
2767 Policy.DemandResIdx = OtherCritIdx;
2768 }
2769
2770 #ifndef NDEBUG
getReasonStr(GenericSchedulerBase::CandReason Reason)2771 const char *GenericSchedulerBase::getReasonStr(
2772 GenericSchedulerBase::CandReason Reason) {
2773 switch (Reason) {
2774 case NoCand: return "NOCAND ";
2775 case Only1: return "ONLY1 ";
2776 case PhysReg: return "PHYS-REG ";
2777 case RegExcess: return "REG-EXCESS";
2778 case RegCritical: return "REG-CRIT ";
2779 case Stall: return "STALL ";
2780 case Cluster: return "CLUSTER ";
2781 case Weak: return "WEAK ";
2782 case RegMax: return "REG-MAX ";
2783 case ResourceReduce: return "RES-REDUCE";
2784 case ResourceDemand: return "RES-DEMAND";
2785 case TopDepthReduce: return "TOP-DEPTH ";
2786 case TopPathReduce: return "TOP-PATH ";
2787 case BotHeightReduce:return "BOT-HEIGHT";
2788 case BotPathReduce: return "BOT-PATH ";
2789 case NextDefUse: return "DEF-USE ";
2790 case NodeOrder: return "ORDER ";
2791 };
2792 llvm_unreachable("Unknown reason!");
2793 }
2794
traceCandidate(const SchedCandidate & Cand)2795 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
2796 PressureChange P;
2797 unsigned ResIdx = 0;
2798 unsigned Latency = 0;
2799 switch (Cand.Reason) {
2800 default:
2801 break;
2802 case RegExcess:
2803 P = Cand.RPDelta.Excess;
2804 break;
2805 case RegCritical:
2806 P = Cand.RPDelta.CriticalMax;
2807 break;
2808 case RegMax:
2809 P = Cand.RPDelta.CurrentMax;
2810 break;
2811 case ResourceReduce:
2812 ResIdx = Cand.Policy.ReduceResIdx;
2813 break;
2814 case ResourceDemand:
2815 ResIdx = Cand.Policy.DemandResIdx;
2816 break;
2817 case TopDepthReduce:
2818 Latency = Cand.SU->getDepth();
2819 break;
2820 case TopPathReduce:
2821 Latency = Cand.SU->getHeight();
2822 break;
2823 case BotHeightReduce:
2824 Latency = Cand.SU->getHeight();
2825 break;
2826 case BotPathReduce:
2827 Latency = Cand.SU->getDepth();
2828 break;
2829 }
2830 dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
2831 if (P.isValid())
2832 dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
2833 << ":" << P.getUnitInc() << " ";
2834 else
2835 dbgs() << " ";
2836 if (ResIdx)
2837 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
2838 else
2839 dbgs() << " ";
2840 if (Latency)
2841 dbgs() << " " << Latency << " cycles ";
2842 else
2843 dbgs() << " ";
2844 dbgs() << '\n';
2845 }
2846 #endif
2847
2848 namespace llvm {
2849 /// Return true if this heuristic determines order.
2850 /// TODO: Consider refactor return type of these functions as integer or enum,
2851 /// as we may need to differentiate whether TryCand is better than Cand.
tryLess(int TryVal,int CandVal,GenericSchedulerBase::SchedCandidate & TryCand,GenericSchedulerBase::SchedCandidate & Cand,GenericSchedulerBase::CandReason Reason)2852 bool tryLess(int TryVal, int CandVal,
2853 GenericSchedulerBase::SchedCandidate &TryCand,
2854 GenericSchedulerBase::SchedCandidate &Cand,
2855 GenericSchedulerBase::CandReason Reason) {
2856 if (TryVal < CandVal) {
2857 TryCand.Reason = Reason;
2858 return true;
2859 }
2860 if (TryVal > CandVal) {
2861 if (Cand.Reason > Reason)
2862 Cand.Reason = Reason;
2863 return true;
2864 }
2865 return false;
2866 }
2867
tryGreater(int TryVal,int CandVal,GenericSchedulerBase::SchedCandidate & TryCand,GenericSchedulerBase::SchedCandidate & Cand,GenericSchedulerBase::CandReason Reason)2868 bool tryGreater(int TryVal, int CandVal,
2869 GenericSchedulerBase::SchedCandidate &TryCand,
2870 GenericSchedulerBase::SchedCandidate &Cand,
2871 GenericSchedulerBase::CandReason Reason) {
2872 if (TryVal > CandVal) {
2873 TryCand.Reason = Reason;
2874 return true;
2875 }
2876 if (TryVal < CandVal) {
2877 if (Cand.Reason > Reason)
2878 Cand.Reason = Reason;
2879 return true;
2880 }
2881 return false;
2882 }
2883
tryLatency(GenericSchedulerBase::SchedCandidate & TryCand,GenericSchedulerBase::SchedCandidate & Cand,SchedBoundary & Zone)2884 bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
2885 GenericSchedulerBase::SchedCandidate &Cand,
2886 SchedBoundary &Zone) {
2887 if (Zone.isTop()) {
2888 // Prefer the candidate with the lesser depth, but only if one of them has
2889 // depth greater than the total latency scheduled so far, otherwise either
2890 // of them could be scheduled now with no stall.
2891 if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
2892 Zone.getScheduledLatency()) {
2893 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2894 TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
2895 return true;
2896 }
2897 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2898 TryCand, Cand, GenericSchedulerBase::TopPathReduce))
2899 return true;
2900 } else {
2901 // Prefer the candidate with the lesser height, but only if one of them has
2902 // height greater than the total latency scheduled so far, otherwise either
2903 // of them could be scheduled now with no stall.
2904 if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
2905 Zone.getScheduledLatency()) {
2906 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2907 TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
2908 return true;
2909 }
2910 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2911 TryCand, Cand, GenericSchedulerBase::BotPathReduce))
2912 return true;
2913 }
2914 return false;
2915 }
2916 } // end namespace llvm
2917
tracePick(GenericSchedulerBase::CandReason Reason,bool IsTop)2918 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
2919 LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2920 << GenericSchedulerBase::getReasonStr(Reason) << '\n');
2921 }
2922
tracePick(const GenericSchedulerBase::SchedCandidate & Cand)2923 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
2924 tracePick(Cand.Reason, Cand.AtTop);
2925 }
2926
initialize(ScheduleDAGMI * dag)2927 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2928 assert(dag->hasVRegLiveness() &&
2929 "(PreRA)GenericScheduler needs vreg liveness");
2930 DAG = static_cast<ScheduleDAGMILive*>(dag);
2931 SchedModel = DAG->getSchedModel();
2932 TRI = DAG->TRI;
2933
2934 if (RegionPolicy.ComputeDFSResult)
2935 DAG->computeDFSResult();
2936
2937 Rem.init(DAG, SchedModel);
2938 Top.init(DAG, SchedModel, &Rem);
2939 Bot.init(DAG, SchedModel, &Rem);
2940
2941 // Initialize resource counts.
2942
2943 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2944 // are disabled, then these HazardRecs will be disabled.
2945 const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2946 if (!Top.HazardRec) {
2947 Top.HazardRec =
2948 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2949 Itin, DAG);
2950 }
2951 if (!Bot.HazardRec) {
2952 Bot.HazardRec =
2953 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2954 Itin, DAG);
2955 }
2956 TopCand.SU = nullptr;
2957 BotCand.SU = nullptr;
2958 }
2959
2960 /// Initialize the per-region scheduling policy.
initPolicy(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,unsigned NumRegionInstrs)2961 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2962 MachineBasicBlock::iterator End,
2963 unsigned NumRegionInstrs) {
2964 const MachineFunction &MF = *Begin->getMF();
2965 const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
2966
2967 // Avoid setting up the register pressure tracker for small regions to save
2968 // compile time. As a rough heuristic, only track pressure when the number of
2969 // schedulable instructions exceeds half the integer register file.
2970 RegionPolicy.ShouldTrackPressure = true;
2971 for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
2972 MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
2973 if (TLI->isTypeLegal(LegalIntVT)) {
2974 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2975 TLI->getRegClassFor(LegalIntVT));
2976 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2977 }
2978 }
2979
2980 // For generic targets, we default to bottom-up, because it's simpler and more
2981 // compile-time optimizations have been implemented in that direction.
2982 RegionPolicy.OnlyBottomUp = true;
2983
2984 // Allow the subtarget to override default policy.
2985 MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
2986
2987 // After subtarget overrides, apply command line options.
2988 if (!EnableRegPressure) {
2989 RegionPolicy.ShouldTrackPressure = false;
2990 RegionPolicy.ShouldTrackLaneMasks = false;
2991 }
2992
2993 // Check -misched-topdown/bottomup can force or unforce scheduling direction.
2994 // e.g. -misched-bottomup=false allows scheduling in both directions.
2995 assert((!ForceTopDown || !ForceBottomUp) &&
2996 "-misched-topdown incompatible with -misched-bottomup");
2997 if (ForceBottomUp.getNumOccurrences() > 0) {
2998 RegionPolicy.OnlyBottomUp = ForceBottomUp;
2999 if (RegionPolicy.OnlyBottomUp)
3000 RegionPolicy.OnlyTopDown = false;
3001 }
3002 if (ForceTopDown.getNumOccurrences() > 0) {
3003 RegionPolicy.OnlyTopDown = ForceTopDown;
3004 if (RegionPolicy.OnlyTopDown)
3005 RegionPolicy.OnlyBottomUp = false;
3006 }
3007 }
3008
dumpPolicy() const3009 void GenericScheduler::dumpPolicy() const {
3010 // Cannot completely remove virtual function even in release mode.
3011 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3012 dbgs() << "GenericScheduler RegionPolicy: "
3013 << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
3014 << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
3015 << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
3016 << "\n";
3017 #endif
3018 }
3019
3020 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
3021 /// critical path by more cycles than it takes to drain the instruction buffer.
3022 /// We estimate an upper bounds on in-flight instructions as:
3023 ///
3024 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
3025 /// InFlightIterations = AcyclicPath / CyclesPerIteration
3026 /// InFlightResources = InFlightIterations * LoopResources
3027 ///
3028 /// TODO: Check execution resources in addition to IssueCount.
checkAcyclicLatency()3029 void GenericScheduler::checkAcyclicLatency() {
3030 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
3031 return;
3032
3033 // Scaled number of cycles per loop iteration.
3034 unsigned IterCount =
3035 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
3036 Rem.RemIssueCount);
3037 // Scaled acyclic critical path.
3038 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
3039 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
3040 unsigned InFlightCount =
3041 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
3042 unsigned BufferLimit =
3043 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
3044
3045 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
3046
3047 LLVM_DEBUG(
3048 dbgs() << "IssueCycles="
3049 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
3050 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
3051 << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
3052 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
3053 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
3054 if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n");
3055 }
3056
registerRoots()3057 void GenericScheduler::registerRoots() {
3058 Rem.CriticalPath = DAG->ExitSU.getDepth();
3059
3060 // Some roots may not feed into ExitSU. Check all of them in case.
3061 for (const SUnit *SU : Bot.Available) {
3062 if (SU->getDepth() > Rem.CriticalPath)
3063 Rem.CriticalPath = SU->getDepth();
3064 }
3065 LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
3066 if (DumpCriticalPathLength) {
3067 errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
3068 }
3069
3070 if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
3071 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
3072 checkAcyclicLatency();
3073 }
3074 }
3075
3076 namespace llvm {
tryPressure(const PressureChange & TryP,const PressureChange & CandP,GenericSchedulerBase::SchedCandidate & TryCand,GenericSchedulerBase::SchedCandidate & Cand,GenericSchedulerBase::CandReason Reason,const TargetRegisterInfo * TRI,const MachineFunction & MF)3077 bool tryPressure(const PressureChange &TryP,
3078 const PressureChange &CandP,
3079 GenericSchedulerBase::SchedCandidate &TryCand,
3080 GenericSchedulerBase::SchedCandidate &Cand,
3081 GenericSchedulerBase::CandReason Reason,
3082 const TargetRegisterInfo *TRI,
3083 const MachineFunction &MF) {
3084 // If one candidate decreases and the other increases, go with it.
3085 // Invalid candidates have UnitInc==0.
3086 if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
3087 Reason)) {
3088 return true;
3089 }
3090 // Do not compare the magnitude of pressure changes between top and bottom
3091 // boundary.
3092 if (Cand.AtTop != TryCand.AtTop)
3093 return false;
3094
3095 // If both candidates affect the same set in the same boundary, go with the
3096 // smallest increase.
3097 unsigned TryPSet = TryP.getPSetOrMax();
3098 unsigned CandPSet = CandP.getPSetOrMax();
3099 if (TryPSet == CandPSet) {
3100 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
3101 Reason);
3102 }
3103
3104 int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
3105 std::numeric_limits<int>::max();
3106
3107 int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
3108 std::numeric_limits<int>::max();
3109
3110 // If the candidates are decreasing pressure, reverse priority.
3111 if (TryP.getUnitInc() < 0)
3112 std::swap(TryRank, CandRank);
3113 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
3114 }
3115
getWeakLeft(const SUnit * SU,bool isTop)3116 unsigned getWeakLeft(const SUnit *SU, bool isTop) {
3117 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
3118 }
3119
3120 /// Minimize physical register live ranges. Regalloc wants them adjacent to
3121 /// their physreg def/use.
3122 ///
3123 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
3124 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
3125 /// with the operation that produces or consumes the physreg. We'll do this when
3126 /// regalloc has support for parallel copies.
biasPhysReg(const SUnit * SU,bool isTop)3127 int biasPhysReg(const SUnit *SU, bool isTop) {
3128 const MachineInstr *MI = SU->getInstr();
3129
3130 if (MI->isCopy()) {
3131 unsigned ScheduledOper = isTop ? 1 : 0;
3132 unsigned UnscheduledOper = isTop ? 0 : 1;
3133 // If we have already scheduled the physreg produce/consumer, immediately
3134 // schedule the copy.
3135 if (MI->getOperand(ScheduledOper).getReg().isPhysical())
3136 return 1;
3137 // If the physreg is at the boundary, defer it. Otherwise schedule it
3138 // immediately to free the dependent. We can hoist the copy later.
3139 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
3140 if (MI->getOperand(UnscheduledOper).getReg().isPhysical())
3141 return AtBoundary ? -1 : 1;
3142 }
3143
3144 if (MI->isMoveImmediate()) {
3145 // If we have a move immediate and all successors have been assigned, bias
3146 // towards scheduling this later. Make sure all register defs are to
3147 // physical registers.
3148 bool DoBias = true;
3149 for (const MachineOperand &Op : MI->defs()) {
3150 if (Op.isReg() && !Op.getReg().isPhysical()) {
3151 DoBias = false;
3152 break;
3153 }
3154 }
3155
3156 if (DoBias)
3157 return isTop ? -1 : 1;
3158 }
3159
3160 return 0;
3161 }
3162 } // end namespace llvm
3163
initCandidate(SchedCandidate & Cand,SUnit * SU,bool AtTop,const RegPressureTracker & RPTracker,RegPressureTracker & TempTracker)3164 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
3165 bool AtTop,
3166 const RegPressureTracker &RPTracker,
3167 RegPressureTracker &TempTracker) {
3168 Cand.SU = SU;
3169 Cand.AtTop = AtTop;
3170 if (DAG->isTrackingPressure()) {
3171 if (AtTop) {
3172 TempTracker.getMaxDownwardPressureDelta(
3173 Cand.SU->getInstr(),
3174 Cand.RPDelta,
3175 DAG->getRegionCriticalPSets(),
3176 DAG->getRegPressure().MaxSetPressure);
3177 } else {
3178 if (VerifyScheduling) {
3179 TempTracker.getMaxUpwardPressureDelta(
3180 Cand.SU->getInstr(),
3181 &DAG->getPressureDiff(Cand.SU),
3182 Cand.RPDelta,
3183 DAG->getRegionCriticalPSets(),
3184 DAG->getRegPressure().MaxSetPressure);
3185 } else {
3186 RPTracker.getUpwardPressureDelta(
3187 Cand.SU->getInstr(),
3188 DAG->getPressureDiff(Cand.SU),
3189 Cand.RPDelta,
3190 DAG->getRegionCriticalPSets(),
3191 DAG->getRegPressure().MaxSetPressure);
3192 }
3193 }
3194 }
3195 LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
3196 << " Try SU(" << Cand.SU->NodeNum << ") "
3197 << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
3198 << Cand.RPDelta.Excess.getUnitInc() << "\n");
3199 }
3200
3201 /// Apply a set of heuristics to a new candidate. Heuristics are currently
3202 /// hierarchical. This may be more efficient than a graduated cost model because
3203 /// we don't need to evaluate all aspects of the model for each node in the
3204 /// queue. But it's really done to make the heuristics easier to debug and
3205 /// statistically analyze.
3206 ///
3207 /// \param Cand provides the policy and current best candidate.
3208 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3209 /// \param Zone describes the scheduled zone that we are extending, or nullptr
3210 /// if Cand is from a different zone than TryCand.
3211 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
tryCandidate(SchedCandidate & Cand,SchedCandidate & TryCand,SchedBoundary * Zone) const3212 bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
3213 SchedCandidate &TryCand,
3214 SchedBoundary *Zone) const {
3215 // Initialize the candidate if needed.
3216 if (!Cand.isValid()) {
3217 TryCand.Reason = NodeOrder;
3218 return true;
3219 }
3220
3221 // Bias PhysReg Defs and copies to their uses and defined respectively.
3222 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
3223 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
3224 return TryCand.Reason != NoCand;
3225
3226 // Avoid exceeding the target's limit.
3227 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
3228 Cand.RPDelta.Excess,
3229 TryCand, Cand, RegExcess, TRI,
3230 DAG->MF))
3231 return TryCand.Reason != NoCand;
3232
3233 // Avoid increasing the max critical pressure in the scheduled region.
3234 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
3235 Cand.RPDelta.CriticalMax,
3236 TryCand, Cand, RegCritical, TRI,
3237 DAG->MF))
3238 return TryCand.Reason != NoCand;
3239
3240 // We only compare a subset of features when comparing nodes between
3241 // Top and Bottom boundary. Some properties are simply incomparable, in many
3242 // other instances we should only override the other boundary if something
3243 // is a clear good pick on one boundary. Skip heuristics that are more
3244 // "tie-breaking" in nature.
3245 bool SameBoundary = Zone != nullptr;
3246 if (SameBoundary) {
3247 // For loops that are acyclic path limited, aggressively schedule for
3248 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
3249 // heuristics to take precedence.
3250 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
3251 tryLatency(TryCand, Cand, *Zone))
3252 return TryCand.Reason != NoCand;
3253
3254 // Prioritize instructions that read unbuffered resources by stall cycles.
3255 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
3256 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3257 return TryCand.Reason != NoCand;
3258 }
3259
3260 // Keep clustered nodes together to encourage downstream peephole
3261 // optimizations which may reduce resource requirements.
3262 //
3263 // This is a best effort to set things up for a post-RA pass. Optimizations
3264 // like generating loads of multiple registers should ideally be done within
3265 // the scheduler pass by combining the loads during DAG postprocessing.
3266 const SUnit *CandNextClusterSU =
3267 Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3268 const SUnit *TryCandNextClusterSU =
3269 TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3270 if (tryGreater(TryCand.SU == TryCandNextClusterSU,
3271 Cand.SU == CandNextClusterSU,
3272 TryCand, Cand, Cluster))
3273 return TryCand.Reason != NoCand;
3274
3275 if (SameBoundary) {
3276 // Weak edges are for clustering and other constraints.
3277 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
3278 getWeakLeft(Cand.SU, Cand.AtTop),
3279 TryCand, Cand, Weak))
3280 return TryCand.Reason != NoCand;
3281 }
3282
3283 // Avoid increasing the max pressure of the entire region.
3284 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
3285 Cand.RPDelta.CurrentMax,
3286 TryCand, Cand, RegMax, TRI,
3287 DAG->MF))
3288 return TryCand.Reason != NoCand;
3289
3290 if (SameBoundary) {
3291 // Avoid critical resource consumption and balance the schedule.
3292 TryCand.initResourceDelta(DAG, SchedModel);
3293 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3294 TryCand, Cand, ResourceReduce))
3295 return TryCand.Reason != NoCand;
3296 if (tryGreater(TryCand.ResDelta.DemandedResources,
3297 Cand.ResDelta.DemandedResources,
3298 TryCand, Cand, ResourceDemand))
3299 return TryCand.Reason != NoCand;
3300
3301 // Avoid serializing long latency dependence chains.
3302 // For acyclic path limited loops, latency was already checked above.
3303 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
3304 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
3305 return TryCand.Reason != NoCand;
3306
3307 // Fall through to original instruction order.
3308 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
3309 || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
3310 TryCand.Reason = NodeOrder;
3311 return true;
3312 }
3313 }
3314
3315 return false;
3316 }
3317
3318 /// Pick the best candidate from the queue.
3319 ///
3320 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
3321 /// DAG building. To adjust for the current scheduling location we need to
3322 /// maintain the number of vreg uses remaining to be top-scheduled.
pickNodeFromQueue(SchedBoundary & Zone,const CandPolicy & ZonePolicy,const RegPressureTracker & RPTracker,SchedCandidate & Cand)3323 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
3324 const CandPolicy &ZonePolicy,
3325 const RegPressureTracker &RPTracker,
3326 SchedCandidate &Cand) {
3327 // getMaxPressureDelta temporarily modifies the tracker.
3328 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
3329
3330 ReadyQueue &Q = Zone.Available;
3331 for (SUnit *SU : Q) {
3332
3333 SchedCandidate TryCand(ZonePolicy);
3334 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
3335 // Pass SchedBoundary only when comparing nodes from the same boundary.
3336 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
3337 if (tryCandidate(Cand, TryCand, ZoneArg)) {
3338 // Initialize resource delta if needed in case future heuristics query it.
3339 if (TryCand.ResDelta == SchedResourceDelta())
3340 TryCand.initResourceDelta(DAG, SchedModel);
3341 Cand.setBest(TryCand);
3342 LLVM_DEBUG(traceCandidate(Cand));
3343 }
3344 }
3345 }
3346
3347 /// Pick the best candidate node from either the top or bottom queue.
pickNodeBidirectional(bool & IsTopNode)3348 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
3349 // Schedule as far as possible in the direction of no choice. This is most
3350 // efficient, but also provides the best heuristics for CriticalPSets.
3351 if (SUnit *SU = Bot.pickOnlyChoice()) {
3352 IsTopNode = false;
3353 tracePick(Only1, false);
3354 return SU;
3355 }
3356 if (SUnit *SU = Top.pickOnlyChoice()) {
3357 IsTopNode = true;
3358 tracePick(Only1, true);
3359 return SU;
3360 }
3361 // Set the bottom-up policy based on the state of the current bottom zone and
3362 // the instructions outside the zone, including the top zone.
3363 CandPolicy BotPolicy;
3364 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
3365 // Set the top-down policy based on the state of the current top zone and
3366 // the instructions outside the zone, including the bottom zone.
3367 CandPolicy TopPolicy;
3368 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
3369
3370 // See if BotCand is still valid (because we previously scheduled from Top).
3371 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
3372 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
3373 BotCand.Policy != BotPolicy) {
3374 BotCand.reset(CandPolicy());
3375 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
3376 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
3377 } else {
3378 LLVM_DEBUG(traceCandidate(BotCand));
3379 #ifndef NDEBUG
3380 if (VerifyScheduling) {
3381 SchedCandidate TCand;
3382 TCand.reset(CandPolicy());
3383 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
3384 assert(TCand.SU == BotCand.SU &&
3385 "Last pick result should correspond to re-picking right now");
3386 }
3387 #endif
3388 }
3389
3390 // Check if the top Q has a better candidate.
3391 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
3392 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
3393 TopCand.Policy != TopPolicy) {
3394 TopCand.reset(CandPolicy());
3395 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
3396 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
3397 } else {
3398 LLVM_DEBUG(traceCandidate(TopCand));
3399 #ifndef NDEBUG
3400 if (VerifyScheduling) {
3401 SchedCandidate TCand;
3402 TCand.reset(CandPolicy());
3403 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
3404 assert(TCand.SU == TopCand.SU &&
3405 "Last pick result should correspond to re-picking right now");
3406 }
3407 #endif
3408 }
3409
3410 // Pick best from BotCand and TopCand.
3411 assert(BotCand.isValid());
3412 assert(TopCand.isValid());
3413 SchedCandidate Cand = BotCand;
3414 TopCand.Reason = NoCand;
3415 if (tryCandidate(Cand, TopCand, nullptr)) {
3416 Cand.setBest(TopCand);
3417 LLVM_DEBUG(traceCandidate(Cand));
3418 }
3419
3420 IsTopNode = Cand.AtTop;
3421 tracePick(Cand);
3422 return Cand.SU;
3423 }
3424
3425 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
pickNode(bool & IsTopNode)3426 SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
3427 if (DAG->top() == DAG->bottom()) {
3428 assert(Top.Available.empty() && Top.Pending.empty() &&
3429 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
3430 return nullptr;
3431 }
3432 SUnit *SU;
3433 do {
3434 if (RegionPolicy.OnlyTopDown) {
3435 SU = Top.pickOnlyChoice();
3436 if (!SU) {
3437 CandPolicy NoPolicy;
3438 TopCand.reset(NoPolicy);
3439 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
3440 assert(TopCand.Reason != NoCand && "failed to find a candidate");
3441 tracePick(TopCand);
3442 SU = TopCand.SU;
3443 }
3444 IsTopNode = true;
3445 } else if (RegionPolicy.OnlyBottomUp) {
3446 SU = Bot.pickOnlyChoice();
3447 if (!SU) {
3448 CandPolicy NoPolicy;
3449 BotCand.reset(NoPolicy);
3450 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
3451 assert(BotCand.Reason != NoCand && "failed to find a candidate");
3452 tracePick(BotCand);
3453 SU = BotCand.SU;
3454 }
3455 IsTopNode = false;
3456 } else {
3457 SU = pickNodeBidirectional(IsTopNode);
3458 }
3459 } while (SU->isScheduled);
3460
3461 if (SU->isTopReady())
3462 Top.removeReady(SU);
3463 if (SU->isBottomReady())
3464 Bot.removeReady(SU);
3465
3466 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3467 << *SU->getInstr());
3468 return SU;
3469 }
3470
reschedulePhysReg(SUnit * SU,bool isTop)3471 void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
3472 MachineBasicBlock::iterator InsertPos = SU->getInstr();
3473 if (!isTop)
3474 ++InsertPos;
3475 SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
3476
3477 // Find already scheduled copies with a single physreg dependence and move
3478 // them just above the scheduled instruction.
3479 for (SDep &Dep : Deps) {
3480 if (Dep.getKind() != SDep::Data ||
3481 !Register::isPhysicalRegister(Dep.getReg()))
3482 continue;
3483 SUnit *DepSU = Dep.getSUnit();
3484 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
3485 continue;
3486 MachineInstr *Copy = DepSU->getInstr();
3487 if (!Copy->isCopy() && !Copy->isMoveImmediate())
3488 continue;
3489 LLVM_DEBUG(dbgs() << " Rescheduling physreg copy ";
3490 DAG->dumpNode(*Dep.getSUnit()));
3491 DAG->moveInstruction(Copy, InsertPos);
3492 }
3493 }
3494
3495 /// Update the scheduler's state after scheduling a node. This is the same node
3496 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
3497 /// update it's state based on the current cycle before MachineSchedStrategy
3498 /// does.
3499 ///
3500 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
3501 /// them here. See comments in biasPhysReg.
schedNode(SUnit * SU,bool IsTopNode)3502 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3503 if (IsTopNode) {
3504 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3505 Top.bumpNode(SU);
3506 if (SU->hasPhysRegUses)
3507 reschedulePhysReg(SU, true);
3508 } else {
3509 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
3510 Bot.bumpNode(SU);
3511 if (SU->hasPhysRegDefs)
3512 reschedulePhysReg(SU, false);
3513 }
3514 }
3515
3516 /// Create the standard converging machine scheduler. This will be used as the
3517 /// default scheduler if the target does not set a default.
createGenericSchedLive(MachineSchedContext * C)3518 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
3519 ScheduleDAGMILive *DAG =
3520 new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
3521 // Register DAG post-processors.
3522 //
3523 // FIXME: extend the mutation API to allow earlier mutations to instantiate
3524 // data and pass it to later mutations. Have a single mutation that gathers
3525 // the interesting nodes in one pass.
3526 DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
3527 return DAG;
3528 }
3529
createConvergingSched(MachineSchedContext * C)3530 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
3531 return createGenericSchedLive(C);
3532 }
3533
3534 static MachineSchedRegistry
3535 GenericSchedRegistry("converge", "Standard converging scheduler.",
3536 createConvergingSched);
3537
3538 //===----------------------------------------------------------------------===//
3539 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
3540 //===----------------------------------------------------------------------===//
3541
initialize(ScheduleDAGMI * Dag)3542 void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
3543 DAG = Dag;
3544 SchedModel = DAG->getSchedModel();
3545 TRI = DAG->TRI;
3546
3547 Rem.init(DAG, SchedModel);
3548 Top.init(DAG, SchedModel, &Rem);
3549 BotRoots.clear();
3550
3551 // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
3552 // or are disabled, then these HazardRecs will be disabled.
3553 const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
3554 if (!Top.HazardRec) {
3555 Top.HazardRec =
3556 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
3557 Itin, DAG);
3558 }
3559 }
3560
registerRoots()3561 void PostGenericScheduler::registerRoots() {
3562 Rem.CriticalPath = DAG->ExitSU.getDepth();
3563
3564 // Some roots may not feed into ExitSU. Check all of them in case.
3565 for (const SUnit *SU : BotRoots) {
3566 if (SU->getDepth() > Rem.CriticalPath)
3567 Rem.CriticalPath = SU->getDepth();
3568 }
3569 LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
3570 if (DumpCriticalPathLength) {
3571 errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
3572 }
3573 }
3574
3575 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
3576 ///
3577 /// \param Cand provides the policy and current best candidate.
3578 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3579 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
tryCandidate(SchedCandidate & Cand,SchedCandidate & TryCand)3580 bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
3581 SchedCandidate &TryCand) {
3582 // Initialize the candidate if needed.
3583 if (!Cand.isValid()) {
3584 TryCand.Reason = NodeOrder;
3585 return true;
3586 }
3587
3588 // Prioritize instructions that read unbuffered resources by stall cycles.
3589 if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
3590 Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3591 return TryCand.Reason != NoCand;
3592
3593 // Keep clustered nodes together.
3594 if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
3595 Cand.SU == DAG->getNextClusterSucc(),
3596 TryCand, Cand, Cluster))
3597 return TryCand.Reason != NoCand;
3598
3599 // Avoid critical resource consumption and balance the schedule.
3600 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3601 TryCand, Cand, ResourceReduce))
3602 return TryCand.Reason != NoCand;
3603 if (tryGreater(TryCand.ResDelta.DemandedResources,
3604 Cand.ResDelta.DemandedResources,
3605 TryCand, Cand, ResourceDemand))
3606 return TryCand.Reason != NoCand;
3607
3608 // Avoid serializing long latency dependence chains.
3609 if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
3610 return TryCand.Reason != NoCand;
3611 }
3612
3613 // Fall through to original instruction order.
3614 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
3615 TryCand.Reason = NodeOrder;
3616 return true;
3617 }
3618
3619 return false;
3620 }
3621
pickNodeFromQueue(SchedCandidate & Cand)3622 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
3623 ReadyQueue &Q = Top.Available;
3624 for (SUnit *SU : Q) {
3625 SchedCandidate TryCand(Cand.Policy);
3626 TryCand.SU = SU;
3627 TryCand.AtTop = true;
3628 TryCand.initResourceDelta(DAG, SchedModel);
3629 if (tryCandidate(Cand, TryCand)) {
3630 Cand.setBest(TryCand);
3631 LLVM_DEBUG(traceCandidate(Cand));
3632 }
3633 }
3634 }
3635
3636 /// Pick the next node to schedule.
pickNode(bool & IsTopNode)3637 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
3638 if (DAG->top() == DAG->bottom()) {
3639 assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
3640 return nullptr;
3641 }
3642 SUnit *SU;
3643 do {
3644 SU = Top.pickOnlyChoice();
3645 if (SU) {
3646 tracePick(Only1, true);
3647 } else {
3648 CandPolicy NoPolicy;
3649 SchedCandidate TopCand(NoPolicy);
3650 // Set the top-down policy based on the state of the current top zone and
3651 // the instructions outside the zone, including the bottom zone.
3652 setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
3653 pickNodeFromQueue(TopCand);
3654 assert(TopCand.Reason != NoCand && "failed to find a candidate");
3655 tracePick(TopCand);
3656 SU = TopCand.SU;
3657 }
3658 } while (SU->isScheduled);
3659
3660 IsTopNode = true;
3661 Top.removeReady(SU);
3662
3663 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3664 << *SU->getInstr());
3665 return SU;
3666 }
3667
3668 /// Called after ScheduleDAGMI has scheduled an instruction and updated
3669 /// scheduled/remaining flags in the DAG nodes.
schedNode(SUnit * SU,bool IsTopNode)3670 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3671 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3672 Top.bumpNode(SU);
3673 }
3674
createGenericSchedPostRA(MachineSchedContext * C)3675 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
3676 return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
3677 /*RemoveKillFlags=*/true);
3678 }
3679
3680 //===----------------------------------------------------------------------===//
3681 // ILP Scheduler. Currently for experimental analysis of heuristics.
3682 //===----------------------------------------------------------------------===//
3683
3684 namespace {
3685
3686 /// Order nodes by the ILP metric.
3687 struct ILPOrder {
3688 const SchedDFSResult *DFSResult = nullptr;
3689 const BitVector *ScheduledTrees = nullptr;
3690 bool MaximizeILP;
3691
ILPOrder__anon6ea857550511::ILPOrder3692 ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
3693
3694 /// Apply a less-than relation on node priority.
3695 ///
3696 /// (Return true if A comes after B in the Q.)
operator ()__anon6ea857550511::ILPOrder3697 bool operator()(const SUnit *A, const SUnit *B) const {
3698 unsigned SchedTreeA = DFSResult->getSubtreeID(A);
3699 unsigned SchedTreeB = DFSResult->getSubtreeID(B);
3700 if (SchedTreeA != SchedTreeB) {
3701 // Unscheduled trees have lower priority.
3702 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
3703 return ScheduledTrees->test(SchedTreeB);
3704
3705 // Trees with shallower connections have have lower priority.
3706 if (DFSResult->getSubtreeLevel(SchedTreeA)
3707 != DFSResult->getSubtreeLevel(SchedTreeB)) {
3708 return DFSResult->getSubtreeLevel(SchedTreeA)
3709 < DFSResult->getSubtreeLevel(SchedTreeB);
3710 }
3711 }
3712 if (MaximizeILP)
3713 return DFSResult->getILP(A) < DFSResult->getILP(B);
3714 else
3715 return DFSResult->getILP(A) > DFSResult->getILP(B);
3716 }
3717 };
3718
3719 /// Schedule based on the ILP metric.
3720 class ILPScheduler : public MachineSchedStrategy {
3721 ScheduleDAGMILive *DAG = nullptr;
3722 ILPOrder Cmp;
3723
3724 std::vector<SUnit*> ReadyQ;
3725
3726 public:
ILPScheduler(bool MaximizeILP)3727 ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
3728
initialize(ScheduleDAGMI * dag)3729 void initialize(ScheduleDAGMI *dag) override {
3730 assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
3731 DAG = static_cast<ScheduleDAGMILive*>(dag);
3732 DAG->computeDFSResult();
3733 Cmp.DFSResult = DAG->getDFSResult();
3734 Cmp.ScheduledTrees = &DAG->getScheduledTrees();
3735 ReadyQ.clear();
3736 }
3737
registerRoots()3738 void registerRoots() override {
3739 // Restore the heap in ReadyQ with the updated DFS results.
3740 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3741 }
3742
3743 /// Implement MachineSchedStrategy interface.
3744 /// -----------------------------------------
3745
3746 /// Callback to select the highest priority node from the ready Q.
pickNode(bool & IsTopNode)3747 SUnit *pickNode(bool &IsTopNode) override {
3748 if (ReadyQ.empty()) return nullptr;
3749 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3750 SUnit *SU = ReadyQ.back();
3751 ReadyQ.pop_back();
3752 IsTopNode = false;
3753 LLVM_DEBUG(dbgs() << "Pick node "
3754 << "SU(" << SU->NodeNum << ") "
3755 << " ILP: " << DAG->getDFSResult()->getILP(SU)
3756 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
3757 << " @"
3758 << DAG->getDFSResult()->getSubtreeLevel(
3759 DAG->getDFSResult()->getSubtreeID(SU))
3760 << '\n'
3761 << "Scheduling " << *SU->getInstr());
3762 return SU;
3763 }
3764
3765 /// Scheduler callback to notify that a new subtree is scheduled.
scheduleTree(unsigned SubtreeID)3766 void scheduleTree(unsigned SubtreeID) override {
3767 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3768 }
3769
3770 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
3771 /// DFSResults, and resort the priority Q.
schedNode(SUnit * SU,bool IsTopNode)3772 void schedNode(SUnit *SU, bool IsTopNode) override {
3773 assert(!IsTopNode && "SchedDFSResult needs bottom-up");
3774 }
3775
releaseTopNode(SUnit *)3776 void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
3777
releaseBottomNode(SUnit * SU)3778 void releaseBottomNode(SUnit *SU) override {
3779 ReadyQ.push_back(SU);
3780 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3781 }
3782 };
3783
3784 } // end anonymous namespace
3785
createILPMaxScheduler(MachineSchedContext * C)3786 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
3787 return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
3788 }
createILPMinScheduler(MachineSchedContext * C)3789 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
3790 return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
3791 }
3792
3793 static MachineSchedRegistry ILPMaxRegistry(
3794 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
3795 static MachineSchedRegistry ILPMinRegistry(
3796 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
3797
3798 //===----------------------------------------------------------------------===//
3799 // Machine Instruction Shuffler for Correctness Testing
3800 //===----------------------------------------------------------------------===//
3801
3802 #ifndef NDEBUG
3803 namespace {
3804
3805 /// Apply a less-than relation on the node order, which corresponds to the
3806 /// instruction order prior to scheduling. IsReverse implements greater-than.
3807 template<bool IsReverse>
3808 struct SUnitOrder {
operator ()__anon6ea857550611::SUnitOrder3809 bool operator()(SUnit *A, SUnit *B) const {
3810 if (IsReverse)
3811 return A->NodeNum > B->NodeNum;
3812 else
3813 return A->NodeNum < B->NodeNum;
3814 }
3815 };
3816
3817 /// Reorder instructions as much as possible.
3818 class InstructionShuffler : public MachineSchedStrategy {
3819 bool IsAlternating;
3820 bool IsTopDown;
3821
3822 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
3823 // gives nodes with a higher number higher priority causing the latest
3824 // instructions to be scheduled first.
3825 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
3826 TopQ;
3827
3828 // When scheduling bottom-up, use greater-than as the queue priority.
3829 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
3830 BottomQ;
3831
3832 public:
InstructionShuffler(bool alternate,bool topdown)3833 InstructionShuffler(bool alternate, bool topdown)
3834 : IsAlternating(alternate), IsTopDown(topdown) {}
3835
initialize(ScheduleDAGMI *)3836 void initialize(ScheduleDAGMI*) override {
3837 TopQ.clear();
3838 BottomQ.clear();
3839 }
3840
3841 /// Implement MachineSchedStrategy interface.
3842 /// -----------------------------------------
3843
pickNode(bool & IsTopNode)3844 SUnit *pickNode(bool &IsTopNode) override {
3845 SUnit *SU;
3846 if (IsTopDown) {
3847 do {
3848 if (TopQ.empty()) return nullptr;
3849 SU = TopQ.top();
3850 TopQ.pop();
3851 } while (SU->isScheduled);
3852 IsTopNode = true;
3853 } else {
3854 do {
3855 if (BottomQ.empty()) return nullptr;
3856 SU = BottomQ.top();
3857 BottomQ.pop();
3858 } while (SU->isScheduled);
3859 IsTopNode = false;
3860 }
3861 if (IsAlternating)
3862 IsTopDown = !IsTopDown;
3863 return SU;
3864 }
3865
schedNode(SUnit * SU,bool IsTopNode)3866 void schedNode(SUnit *SU, bool IsTopNode) override {}
3867
releaseTopNode(SUnit * SU)3868 void releaseTopNode(SUnit *SU) override {
3869 TopQ.push(SU);
3870 }
releaseBottomNode(SUnit * SU)3871 void releaseBottomNode(SUnit *SU) override {
3872 BottomQ.push(SU);
3873 }
3874 };
3875
3876 } // end anonymous namespace
3877
createInstructionShuffler(MachineSchedContext * C)3878 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
3879 bool Alternate = !ForceTopDown && !ForceBottomUp;
3880 bool TopDown = !ForceBottomUp;
3881 assert((TopDown || !ForceTopDown) &&
3882 "-misched-topdown incompatible with -misched-bottomup");
3883 return new ScheduleDAGMILive(
3884 C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
3885 }
3886
3887 static MachineSchedRegistry ShufflerRegistry(
3888 "shuffle", "Shuffle machine instructions alternating directions",
3889 createInstructionShuffler);
3890 #endif // !NDEBUG
3891
3892 //===----------------------------------------------------------------------===//
3893 // GraphWriter support for ScheduleDAGMILive.
3894 //===----------------------------------------------------------------------===//
3895
3896 #ifndef NDEBUG
3897 namespace llvm {
3898
3899 template<> struct GraphTraits<
3900 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
3901
3902 template<>
3903 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
DOTGraphTraitsllvm::DOTGraphTraits3904 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3905
getGraphNamellvm::DOTGraphTraits3906 static std::string getGraphName(const ScheduleDAG *G) {
3907 return std::string(G->MF.getName());
3908 }
3909
renderGraphFromBottomUpllvm::DOTGraphTraits3910 static bool renderGraphFromBottomUp() {
3911 return true;
3912 }
3913
isNodeHiddenllvm::DOTGraphTraits3914 static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
3915 if (ViewMISchedCutoff == 0)
3916 return false;
3917 return (Node->Preds.size() > ViewMISchedCutoff
3918 || Node->Succs.size() > ViewMISchedCutoff);
3919 }
3920
3921 /// If you want to override the dot attributes printed for a particular
3922 /// edge, override this method.
getEdgeAttributesllvm::DOTGraphTraits3923 static std::string getEdgeAttributes(const SUnit *Node,
3924 SUnitIterator EI,
3925 const ScheduleDAG *Graph) {
3926 if (EI.isArtificialDep())
3927 return "color=cyan,style=dashed";
3928 if (EI.isCtrlDep())
3929 return "color=blue,style=dashed";
3930 return "";
3931 }
3932
getNodeLabelllvm::DOTGraphTraits3933 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
3934 std::string Str;
3935 raw_string_ostream SS(Str);
3936 const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3937 const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3938 static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3939 SS << "SU:" << SU->NodeNum;
3940 if (DFS)
3941 SS << " I:" << DFS->getNumInstrs(SU);
3942 return SS.str();
3943 }
3944
getNodeDescriptionllvm::DOTGraphTraits3945 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
3946 return G->getGraphNodeLabel(SU);
3947 }
3948
getNodeAttributesllvm::DOTGraphTraits3949 static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
3950 std::string Str("shape=Mrecord");
3951 const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3952 const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3953 static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3954 if (DFS) {
3955 Str += ",style=filled,fillcolor=\"#";
3956 Str += DOT::getColorString(DFS->getSubtreeID(N));
3957 Str += '"';
3958 }
3959 return Str;
3960 }
3961 };
3962
3963 } // end namespace llvm
3964 #endif // NDEBUG
3965
3966 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
3967 /// rendered using 'dot'.
viewGraph(const Twine & Name,const Twine & Title)3968 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
3969 #ifndef NDEBUG
3970 ViewGraph(this, Name, false, Title);
3971 #else
3972 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
3973 << "systems with Graphviz or gv!\n";
3974 #endif // NDEBUG
3975 }
3976
3977 /// Out-of-line implementation with no arguments is handy for gdb.
viewGraph()3978 void ScheduleDAGMI::viewGraph() {
3979 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
3980 }
3981