1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Software pipelining (SWP) is an instruction scheduling technique for loops
10 // that overlaps loop iterations and exploits ILP via compiler transformations.
11 //
12 // There are multiple methods for analyzing a loop and creating a schedule.
13 // An example algorithm is Swing Modulo Scheduling (implemented by the
14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
15 // for the task of actually rewriting a loop to adhere to the schedule, which
16 // is what this file does.
17 //
18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
19 // that we only support single-block loops, so "block" and "loop" can be used
20 // interchangably.
21 //
22 // The Cycle of an instruction defines a partial order of the instructions in
23 // the remapped loop. Instructions within a cycle must not consume the output
24 // of any instruction in the same cycle. Cycle information is assumed to have
25 // been calculated such that the processor will execute instructions in
26 // lock-step (for example in a VLIW ISA).
27 //
28 // The Stage of an instruction defines the mapping between logical loop
29 // iterations and pipelined loop iterations. An example (unrolled) pipeline
30 // may look something like:
31 //
32 //  I0[0]                      Execute instruction I0 of iteration 0
33 //  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1
34 //         I1[1], I0[2]
35 //                I1[2], I0[3]
36 //
37 // In the schedule for this unrolled sequence we would say that I0 was scheduled
38 // in stage 0 and I1 in stage 1:
39 //
40 //  loop:
41 //    [stage 0] x = I0
42 //    [stage 1] I1 x (from stage 0)
43 //
44 // And to actually generate valid code we must insert a phi:
45 //
46 //  loop:
47 //    x' = phi(x)
48 //    x = I0
49 //    I1 x'
50 //
51 // This is a simple example; the rules for how to generate correct code given
52 // an arbitrary schedule containing loop-carried values are complex.
53 //
54 // Note that these examples only mention the steady-state kernel of the
55 // generated loop; prologs and epilogs must be generated also that prime and
56 // flush the pipeline. Doing so is nontrivial.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
61 #define LLVM_CODEGEN_MODULOSCHEDULE_H
62 
63 #include "llvm/CodeGen/MachineFunction.h"
64 #include "llvm/CodeGen/MachineLoopUtils.h"
65 #include "llvm/CodeGen/TargetInstrInfo.h"
66 #include "llvm/CodeGen/TargetSubtargetInfo.h"
67 #include <deque>
68 #include <map>
69 #include <vector>
70 
71 namespace llvm {
72 class MachineBasicBlock;
73 class MachineLoop;
74 class MachineRegisterInfo;
75 class MachineInstr;
76 class LiveIntervals;
77 
78 /// Represents a schedule for a single-block loop. For every instruction we
79 /// maintain a Cycle and Stage.
80 class ModuloSchedule {
81 private:
82   /// The block containing the loop instructions.
83   MachineLoop *Loop;
84 
85   /// The instructions to be generated, in total order. Cycle provides a partial
86   /// order; the total order within cycles has been decided by the schedule
87   /// producer.
88   std::vector<MachineInstr *> ScheduledInstrs;
89 
90   /// The cycle for each instruction.
91   DenseMap<MachineInstr *, int> Cycle;
92 
93   /// The stage for each instruction.
94   DenseMap<MachineInstr *, int> Stage;
95 
96   /// The number of stages in this schedule (Max(Stage) + 1).
97   int NumStages;
98 
99 public:
100   /// Create a new ModuloSchedule.
101   /// \arg ScheduledInstrs The new loop instructions, in total resequenced
102   ///    order.
103   /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
104   ///    not need to start at zero. ScheduledInstrs must be partially ordered by
105   ///    Cycle.
106   /// \arg Stage Stage index for all instructions in ScheduleInstrs.
107   ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
108                  std::vector<MachineInstr *> ScheduledInstrs,
109                  DenseMap<MachineInstr *, int> Cycle,
110                  DenseMap<MachineInstr *, int> Stage)
111       : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
112         Stage(std::move(Stage)) {
113     NumStages = 0;
114     for (auto &KV : this->Stage)
115       NumStages = std::max(NumStages, KV.second);
116     ++NumStages;
117   }
118 
119   /// Return the single-block loop being scheduled.
120   MachineLoop *getLoop() const { return Loop; }
121 
122   /// Return the number of stages contained in this schedule, which is the
123   /// largest stage index + 1.
124   int getNumStages() const { return NumStages; }
125 
126   /// Return the first cycle in the schedule, which is the cycle index of the
127   /// first instruction.
128   int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
129 
130   /// Return the final cycle in the schedule, which is the cycle index of the
131   /// last instruction.
132   int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
133 
134   /// Return the stage that MI is scheduled in, or -1.
135   int getStage(MachineInstr *MI) {
136     auto I = Stage.find(MI);
137     return I == Stage.end() ? -1 : I->second;
138   }
139 
140   /// Return the cycle that MI is scheduled at, or -1.
141   int getCycle(MachineInstr *MI) {
142     auto I = Cycle.find(MI);
143     return I == Cycle.end() ? -1 : I->second;
144   }
145 
146   /// Set the stage of a newly created instruction.
147   void setStage(MachineInstr *MI, int MIStage) {
148     assert(Stage.count(MI) == 0);
149     Stage[MI] = MIStage;
150   }
151 
152   /// Return the rescheduled instructions in order.
153   ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
154 
155   void dump() { print(dbgs()); }
156   void print(raw_ostream &OS);
157 };
158 
159 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
160 /// rewriting the old loop and inserting prologs and epilogs as required.
161 class ModuloScheduleExpander {
162 public:
163   using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
164 
165 private:
166   using ValueMapTy = DenseMap<unsigned, unsigned>;
167   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
168   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
169 
170   ModuloSchedule &Schedule;
171   MachineFunction &MF;
172   const TargetSubtargetInfo &ST;
173   MachineRegisterInfo &MRI;
174   const TargetInstrInfo *TII = nullptr;
175   LiveIntervals &LIS;
176 
177   MachineBasicBlock *BB = nullptr;
178   MachineBasicBlock *Preheader = nullptr;
179   MachineBasicBlock *NewKernel = nullptr;
180   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
181 
182   /// Map for each register and the max difference between its uses and def.
183   /// The first element in the pair is the max difference in stages. The
184   /// second is true if the register defines a Phi value and loop value is
185   /// scheduled before the Phi.
186   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
187 
188   /// Instructions to change when emitting the final schedule.
189   InstrChangesTy InstrChanges;
190 
191   void generatePipelinedLoop();
192   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
193                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
194   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
195                       MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
196                       ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
197                       MBBVectorTy &PrologBBs);
198   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
199                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
200                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
201                             unsigned LastStageNum, unsigned CurStageNum,
202                             bool IsLast);
203   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
204                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
205                     ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
206                     InstrMapTy &InstrMap, unsigned LastStageNum,
207                     unsigned CurStageNum, bool IsLast);
208   void removeDeadInstructions(MachineBasicBlock *KernelBB,
209                               MBBVectorTy &EpilogBBs);
210   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
211   void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
212                    MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
213                    ValueMapTy *VRMap);
214   bool computeDelta(MachineInstr &MI, unsigned &Delta);
215   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
216                          unsigned Num);
217   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
218                            unsigned InstStageNum);
219   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
220                                     unsigned InstStageNum);
221   void updateInstruction(MachineInstr *NewMI, bool LastDef,
222                          unsigned CurStageNum, unsigned InstrStageNum,
223                          ValueMapTy *VRMap);
224   MachineInstr *findDefInLoop(unsigned Reg);
225   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
226                          unsigned LoopStage, ValueMapTy *VRMap,
227                          MachineBasicBlock *BB);
228   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
229                         ValueMapTy *VRMap, InstrMapTy &InstrMap);
230   void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
231                              unsigned CurStageNum, unsigned PhiNum,
232                              MachineInstr *Phi, unsigned OldReg,
233                              unsigned NewReg, unsigned PrevReg = 0);
234   bool isLoopCarried(MachineInstr &Phi);
235 
236   /// Return the max. number of stages/iterations that can occur between a
237   /// register definition and its uses.
238   unsigned getStagesForReg(int Reg, unsigned CurStage) {
239     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
240     if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
241         Stages.second)
242       return 1;
243     return Stages.first;
244   }
245 
246   /// The number of stages for a Phi is a little different than other
247   /// instructions. The minimum value computed in RegToStageDiff is 1
248   /// because we assume the Phi is needed for at least 1 iteration.
249   /// This is not the case if the loop value is scheduled prior to the
250   /// Phi in the same stage.  This function returns the number of stages
251   /// or iterations needed between the Phi definition and any uses.
252   unsigned getStagesForPhi(int Reg) {
253     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
254     if (Stages.second)
255       return Stages.first;
256     return Stages.first - 1;
257   }
258 
259 public:
260   /// Create a new ModuloScheduleExpander.
261   /// \arg InstrChanges Modifications to make to instructions with memory
262   ///   operands.
263   /// FIXME: InstrChanges is opaque and is an implementation detail of an
264   ///   optimization in MachinePipeliner that crosses abstraction boundaries.
265   ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
266                          LiveIntervals &LIS, InstrChangesTy InstrChanges)
267       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
268         TII(ST.getInstrInfo()), LIS(LIS),
269         InstrChanges(std::move(InstrChanges)) {}
270 
271   /// Performs the actual expansion.
272   void expand();
273   /// Performs final cleanup after expansion.
274   void cleanup();
275 
276   /// Returns the newly rewritten kernel block, or nullptr if this was
277   /// optimized away.
278   MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
279 };
280 
281 /// A reimplementation of ModuloScheduleExpander. It works by generating a
282 /// standalone kernel loop and peeling out the prologs and epilogs.
283 class PeelingModuloScheduleExpander {
284 public:
285   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
286                                 LiveIntervals *LIS)
287       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
288         TII(ST.getInstrInfo()), LIS(LIS) {}
289 
290   void expand();
291 
292   /// Runs ModuloScheduleExpander and treats it as a golden input to validate
293   /// aspects of the code generated by PeelingModuloScheduleExpander.
294   void validateAgainstModuloScheduleExpander();
295 
296 protected:
297   ModuloSchedule &Schedule;
298   MachineFunction &MF;
299   const TargetSubtargetInfo &ST;
300   MachineRegisterInfo &MRI;
301   const TargetInstrInfo *TII = nullptr;
302   LiveIntervals *LIS = nullptr;
303 
304   /// The original loop block that gets rewritten in-place.
305   MachineBasicBlock *BB = nullptr;
306   /// The original loop preheader.
307   MachineBasicBlock *Preheader = nullptr;
308   /// All prolog and epilog blocks.
309   SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
310   /// For every block, the stages that are produced.
311   DenseMap<MachineBasicBlock *, BitVector> LiveStages;
312   /// For every block, the stages that are available. A stage can be available
313   /// but not produced (in the epilog) or produced but not available (in the
314   /// prolog).
315   DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
316   /// When peeling the epilogue keep track of the distance between the phi
317   /// nodes and the kernel.
318   DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
319 
320   /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
321   /// loop kernel clones.
322   DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
323   DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
324       BlockMIs;
325 
326   /// State passed from peelKernel to peelPrologAndEpilogs().
327   std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
328   /// Illegal phis that need to be deleted once we re-link stages.
329   SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
330 
331   /// Converts BB from the original loop body to the rewritten, pipelined
332   /// steady-state.
333   void rewriteKernel();
334 
335   /// Peels one iteration of the rewritten kernel (BB) in the specified
336   /// direction.
337   MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
338   // Delete instructions whose stage is less than MinStage in the given basic
339   // block.
340   void filterInstructions(MachineBasicBlock *MB, int MinStage);
341   // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
342   // instructions to keep a valid IR.
343   void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
344                               MachineBasicBlock *SourceBB, unsigned Stage);
345   /// Peel the kernel forwards and backwards to produce prologs and epilogs,
346   /// and stitch them together.
347   void peelPrologAndEpilogs();
348   /// All prolog and epilog blocks are clones of the kernel, so any produced
349   /// register in one block has an corollary in all other blocks.
350   Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
351   /// Change all users of MI, if MI is predicated out
352   /// (LiveStages[MI->getParent()] == false).
353   void rewriteUsesOf(MachineInstr *MI);
354   /// Insert branches between prologs, kernel and epilogs.
355   void fixupBranches();
356   /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
357   /// to a block dominated by all prologs and epilogs. This allows us to treat
358   /// the loop exiting block as any other kernel clone.
359   MachineBasicBlock *CreateLCSSAExitingBlock();
360   /// Helper to get the stage of an instruction in the schedule.
361   unsigned getStage(MachineInstr *MI) {
362     if (CanonicalMIs.count(MI))
363       MI = CanonicalMIs[MI];
364     return Schedule.getStage(MI);
365   }
366   /// Helper function to find the right canonical register for a phi instruction
367   /// coming from a peeled out prologue.
368   Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
369   /// Target loop info before kernel peeling.
370   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
371 };
372 
373 /// Expander that simply annotates each scheduled instruction with a post-instr
374 /// symbol that can be consumed by the ModuloScheduleTest pass.
375 ///
376 /// The post-instr symbol is a way of annotating an instruction that can be
377 /// roundtripped in MIR. The syntax is:
378 ///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
379 class ModuloScheduleTestAnnotater {
380   MachineFunction &MF;
381   ModuloSchedule &S;
382 
383 public:
384   ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
385       : MF(MF), S(S) {}
386 
387   /// Performs the annotation.
388   void annotate();
389 };
390 
391 } // end namespace llvm
392 
393 #endif // LLVM_CODEGEN_MODULOSCHEDULE_H
394