1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Software pipelining (SWP) is an instruction scheduling technique for loops 10 // that overlaps loop iterations and exploits ILP via compiler transformations. 11 // 12 // There are multiple methods for analyzing a loop and creating a schedule. 13 // An example algorithm is Swing Modulo Scheduling (implemented by the 14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant 15 // for the task of actually rewriting a loop to adhere to the schedule, which 16 // is what this file does. 17 // 18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note 19 // that we only support single-block loops, so "block" and "loop" can be used 20 // interchangably. 21 // 22 // The Cycle of an instruction defines a partial order of the instructions in 23 // the remapped loop. Instructions within a cycle must not consume the output 24 // of any instruction in the same cycle. Cycle information is assumed to have 25 // been calculated such that the processor will execute instructions in 26 // lock-step (for example in a VLIW ISA). 27 // 28 // The Stage of an instruction defines the mapping between logical loop 29 // iterations and pipelined loop iterations. An example (unrolled) pipeline 30 // may look something like: 31 // 32 // I0[0] Execute instruction I0 of iteration 0 33 // I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1 34 // I1[1], I0[2] 35 // I1[2], I0[3] 36 // 37 // In the schedule for this unrolled sequence we would say that I0 was scheduled 38 // in stage 0 and I1 in stage 1: 39 // 40 // loop: 41 // [stage 0] x = I0 42 // [stage 1] I1 x (from stage 0) 43 // 44 // And to actually generate valid code we must insert a phi: 45 // 46 // loop: 47 // x' = phi(x) 48 // x = I0 49 // I1 x' 50 // 51 // This is a simple example; the rules for how to generate correct code given 52 // an arbitrary schedule containing loop-carried values are complex. 53 // 54 // Note that these examples only mention the steady-state kernel of the 55 // generated loop; prologs and epilogs must be generated also that prime and 56 // flush the pipeline. Doing so is nontrivial. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H 61 #define LLVM_CODEGEN_MODULOSCHEDULE_H 62 63 #include "llvm/CodeGen/MachineFunction.h" 64 #include "llvm/CodeGen/MachineLoopUtils.h" 65 #include "llvm/CodeGen/TargetInstrInfo.h" 66 #include "llvm/CodeGen/TargetSubtargetInfo.h" 67 #include <deque> 68 #include <vector> 69 70 namespace llvm { 71 class MachineBasicBlock; 72 class MachineLoop; 73 class MachineRegisterInfo; 74 class MachineInstr; 75 class LiveIntervals; 76 77 /// Represents a schedule for a single-block loop. For every instruction we 78 /// maintain a Cycle and Stage. 79 class ModuloSchedule { 80 private: 81 /// The block containing the loop instructions. 82 MachineLoop *Loop; 83 84 /// The instructions to be generated, in total order. Cycle provides a partial 85 /// order; the total order within cycles has been decided by the schedule 86 /// producer. 87 std::vector<MachineInstr *> ScheduledInstrs; 88 89 /// The cycle for each instruction. 90 DenseMap<MachineInstr *, int> Cycle; 91 92 /// The stage for each instruction. 93 DenseMap<MachineInstr *, int> Stage; 94 95 /// The number of stages in this schedule (Max(Stage) + 1). 96 int NumStages; 97 98 public: 99 /// Create a new ModuloSchedule. 100 /// \arg ScheduledInstrs The new loop instructions, in total resequenced 101 /// order. 102 /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does 103 /// not need to start at zero. ScheduledInstrs must be partially ordered by 104 /// Cycle. 105 /// \arg Stage Stage index for all instructions in ScheduleInstrs. 106 ModuloSchedule(MachineFunction &MF, MachineLoop *Loop, 107 std::vector<MachineInstr *> ScheduledInstrs, 108 DenseMap<MachineInstr *, int> Cycle, 109 DenseMap<MachineInstr *, int> Stage) 110 : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)), 111 Stage(std::move(Stage)) { 112 NumStages = 0; 113 for (auto &KV : this->Stage) 114 NumStages = std::max(NumStages, KV.second); 115 ++NumStages; 116 } 117 118 /// Return the single-block loop being scheduled. 119 MachineLoop *getLoop() const { return Loop; } 120 121 /// Return the number of stages contained in this schedule, which is the 122 /// largest stage index + 1. 123 int getNumStages() const { return NumStages; } 124 125 /// Return the first cycle in the schedule, which is the cycle index of the 126 /// first instruction. 127 int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; } 128 129 /// Return the final cycle in the schedule, which is the cycle index of the 130 /// last instruction. 131 int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; } 132 133 /// Return the stage that MI is scheduled in, or -1. 134 int getStage(MachineInstr *MI) { 135 auto I = Stage.find(MI); 136 return I == Stage.end() ? -1 : I->second; 137 } 138 139 /// Return the cycle that MI is scheduled at, or -1. 140 int getCycle(MachineInstr *MI) { 141 auto I = Cycle.find(MI); 142 return I == Cycle.end() ? -1 : I->second; 143 } 144 145 /// Set the stage of a newly created instruction. 146 void setStage(MachineInstr *MI, int MIStage) { 147 assert(Stage.count(MI) == 0); 148 Stage[MI] = MIStage; 149 } 150 151 /// Return the rescheduled instructions in order. 152 ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; } 153 154 void dump() { print(dbgs()); } 155 void print(raw_ostream &OS); 156 }; 157 158 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, 159 /// rewriting the old loop and inserting prologs and epilogs as required. 160 class ModuloScheduleExpander { 161 public: 162 using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>; 163 164 private: 165 using ValueMapTy = DenseMap<unsigned, unsigned>; 166 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; 167 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; 168 169 ModuloSchedule &Schedule; 170 MachineFunction &MF; 171 const TargetSubtargetInfo &ST; 172 MachineRegisterInfo &MRI; 173 const TargetInstrInfo *TII = nullptr; 174 LiveIntervals &LIS; 175 176 MachineBasicBlock *BB = nullptr; 177 MachineBasicBlock *Preheader = nullptr; 178 MachineBasicBlock *NewKernel = nullptr; 179 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; 180 181 /// Map for each register and the max difference between its uses and def. 182 /// The first element in the pair is the max difference in stages. The 183 /// second is true if the register defines a Phi value and loop value is 184 /// scheduled before the Phi. 185 std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff; 186 187 /// Instructions to change when emitting the final schedule. 188 InstrChangesTy InstrChanges; 189 190 void generatePipelinedLoop(); 191 void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, 192 ValueMapTy *VRMap, MBBVectorTy &PrologBBs); 193 void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB, 194 MachineBasicBlock *OrigBB, ValueMapTy *VRMap, 195 ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs, 196 MBBVectorTy &PrologBBs); 197 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, 198 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, 199 ValueMapTy *VRMap, InstrMapTy &InstrMap, 200 unsigned LastStageNum, unsigned CurStageNum, 201 bool IsLast); 202 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, 203 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, 204 ValueMapTy *VRMap, ValueMapTy *VRMapPhi, 205 InstrMapTy &InstrMap, unsigned LastStageNum, 206 unsigned CurStageNum, bool IsLast); 207 void removeDeadInstructions(MachineBasicBlock *KernelBB, 208 MBBVectorTy &EpilogBBs); 209 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); 210 void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, 211 MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, 212 ValueMapTy *VRMap); 213 bool computeDelta(MachineInstr &MI, unsigned &Delta); 214 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, 215 unsigned Num); 216 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, 217 unsigned InstStageNum); 218 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, 219 unsigned InstStageNum); 220 void updateInstruction(MachineInstr *NewMI, bool LastDef, 221 unsigned CurStageNum, unsigned InstrStageNum, 222 ValueMapTy *VRMap); 223 MachineInstr *findDefInLoop(unsigned Reg); 224 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, 225 unsigned LoopStage, ValueMapTy *VRMap, 226 MachineBasicBlock *BB); 227 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, 228 ValueMapTy *VRMap, InstrMapTy &InstrMap); 229 void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap, 230 unsigned CurStageNum, unsigned PhiNum, 231 MachineInstr *Phi, unsigned OldReg, 232 unsigned NewReg, unsigned PrevReg = 0); 233 bool isLoopCarried(MachineInstr &Phi); 234 235 /// Return the max. number of stages/iterations that can occur between a 236 /// register definition and its uses. 237 unsigned getStagesForReg(int Reg, unsigned CurStage) { 238 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; 239 if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 && 240 Stages.second) 241 return 1; 242 return Stages.first; 243 } 244 245 /// The number of stages for a Phi is a little different than other 246 /// instructions. The minimum value computed in RegToStageDiff is 1 247 /// because we assume the Phi is needed for at least 1 iteration. 248 /// This is not the case if the loop value is scheduled prior to the 249 /// Phi in the same stage. This function returns the number of stages 250 /// or iterations needed between the Phi definition and any uses. 251 unsigned getStagesForPhi(int Reg) { 252 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; 253 if (Stages.second) 254 return Stages.first; 255 return Stages.first - 1; 256 } 257 258 public: 259 /// Create a new ModuloScheduleExpander. 260 /// \arg InstrChanges Modifications to make to instructions with memory 261 /// operands. 262 /// FIXME: InstrChanges is opaque and is an implementation detail of an 263 /// optimization in MachinePipeliner that crosses abstraction boundaries. 264 ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, 265 LiveIntervals &LIS, InstrChangesTy InstrChanges) 266 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), 267 TII(ST.getInstrInfo()), LIS(LIS), 268 InstrChanges(std::move(InstrChanges)) {} 269 270 /// Performs the actual expansion. 271 void expand(); 272 /// Performs final cleanup after expansion. 273 void cleanup(); 274 275 /// Returns the newly rewritten kernel block, or nullptr if this was 276 /// optimized away. 277 MachineBasicBlock *getRewrittenKernel() { return NewKernel; } 278 }; 279 280 /// A reimplementation of ModuloScheduleExpander. It works by generating a 281 /// standalone kernel loop and peeling out the prologs and epilogs. 282 class PeelingModuloScheduleExpander { 283 public: 284 PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, 285 LiveIntervals *LIS) 286 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), 287 TII(ST.getInstrInfo()), LIS(LIS) {} 288 289 void expand(); 290 291 /// Runs ModuloScheduleExpander and treats it as a golden input to validate 292 /// aspects of the code generated by PeelingModuloScheduleExpander. 293 void validateAgainstModuloScheduleExpander(); 294 295 protected: 296 ModuloSchedule &Schedule; 297 MachineFunction &MF; 298 const TargetSubtargetInfo &ST; 299 MachineRegisterInfo &MRI; 300 const TargetInstrInfo *TII = nullptr; 301 LiveIntervals *LIS = nullptr; 302 303 /// The original loop block that gets rewritten in-place. 304 MachineBasicBlock *BB = nullptr; 305 /// The original loop preheader. 306 MachineBasicBlock *Preheader = nullptr; 307 /// All prolog and epilog blocks. 308 SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs; 309 /// For every block, the stages that are produced. 310 DenseMap<MachineBasicBlock *, BitVector> LiveStages; 311 /// For every block, the stages that are available. A stage can be available 312 /// but not produced (in the epilog) or produced but not available (in the 313 /// prolog). 314 DenseMap<MachineBasicBlock *, BitVector> AvailableStages; 315 /// When peeling the epilogue keep track of the distance between the phi 316 /// nodes and the kernel. 317 DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration; 318 319 /// CanonicalMIs and BlockMIs form a bidirectional map between any of the 320 /// loop kernel clones. 321 DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs; 322 DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *> 323 BlockMIs; 324 325 /// State passed from peelKernel to peelPrologAndEpilogs(). 326 std::deque<MachineBasicBlock *> PeeledFront, PeeledBack; 327 /// Illegal phis that need to be deleted once we re-link stages. 328 SmallVector<MachineInstr *, 4> IllegalPhisToDelete; 329 330 /// Converts BB from the original loop body to the rewritten, pipelined 331 /// steady-state. 332 void rewriteKernel(); 333 334 /// Peels one iteration of the rewritten kernel (BB) in the specified 335 /// direction. 336 MachineBasicBlock *peelKernel(LoopPeelDirection LPD); 337 // Delete instructions whose stage is less than MinStage in the given basic 338 // block. 339 void filterInstructions(MachineBasicBlock *MB, int MinStage); 340 // Move instructions of the given stage from sourceBB to DestBB. Remap the phi 341 // instructions to keep a valid IR. 342 void moveStageBetweenBlocks(MachineBasicBlock *DestBB, 343 MachineBasicBlock *SourceBB, unsigned Stage); 344 /// Peel the kernel forwards and backwards to produce prologs and epilogs, 345 /// and stitch them together. 346 void peelPrologAndEpilogs(); 347 /// All prolog and epilog blocks are clones of the kernel, so any produced 348 /// register in one block has an corollary in all other blocks. 349 Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); 350 /// Change all users of MI, if MI is predicated out 351 /// (LiveStages[MI->getParent()] == false). 352 void rewriteUsesOf(MachineInstr *MI); 353 /// Insert branches between prologs, kernel and epilogs. 354 void fixupBranches(); 355 /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block 356 /// to a block dominated by all prologs and epilogs. This allows us to treat 357 /// the loop exiting block as any other kernel clone. 358 MachineBasicBlock *CreateLCSSAExitingBlock(); 359 /// Helper to get the stage of an instruction in the schedule. 360 unsigned getStage(MachineInstr *MI) { 361 if (CanonicalMIs.count(MI)) 362 MI = CanonicalMIs[MI]; 363 return Schedule.getStage(MI); 364 } 365 /// Helper function to find the right canonical register for a phi instruction 366 /// coming from a peeled out prologue. 367 Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi); 368 /// Target loop info before kernel peeling. 369 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; 370 }; 371 372 /// Expander that simply annotates each scheduled instruction with a post-instr 373 /// symbol that can be consumed by the ModuloScheduleTest pass. 374 /// 375 /// The post-instr symbol is a way of annotating an instruction that can be 376 /// roundtripped in MIR. The syntax is: 377 /// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5> 378 class ModuloScheduleTestAnnotater { 379 MachineFunction &MF; 380 ModuloSchedule &S; 381 382 public: 383 ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S) 384 : MF(MF), S(S) {} 385 386 /// Performs the annotation. 387 void annotate(); 388 }; 389 390 } // end namespace llvm 391 392 #endif // LLVM_CODEGEN_MODULOSCHEDULE_H 393