//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At the dispatch stage, instructions are mapped by the LSUnit to
/// MemoryGroups. A memory group identifier is then stored as a "token" in
/// field Instruction::LSUTokenID of each dispatched instruction. That token
/// is used internally by the LSUnit to track memory dependencies.
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }
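
  /// Registers Group as a successor of this group.
  ///
  /// With IsDataDependent set to false, Group only has an order dependency on
  /// this group: that dependency is released as soon as this group starts
  /// executing. With IsDataDependent set to true, Group is released only once
  /// this group has fully executed. A minimal sketch (in practice, groups are
  /// created and wired together by the LSUnit, not by hand):
  /// \code
  ///   MemoryGroup StoreGroup, LoadGroup;
  ///   StoreGroup.addInstruction();
  ///   // The loads must wait (at least) until the stores start executing.
  ///   StoreGroup.addSuccessor(&LoadGroup, /*IsDataDependent=*/false);
  /// \endcode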
  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add an order dependency if all instructions from
    // this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data-dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};
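
// A note on the expected lifecycle of a MemoryGroup. This is a sketch of how
// the LSUnit drives the callbacks above; here IR stands for the InstRef of a
// dispatched memory operation that was mapped to group G:
//
//   MemoryGroup G;
//   G.addInstruction();          // Map one memory operation to G.
//   assert(G.isReady());         // No predecessors: G is immediately ready.
//   G.onInstructionIssued(IR);   // G starts executing; order-dependent
//                                // successors are released.
//   G.onInstructionExecuted(IR); // G is fully executed; data-dependent
//                                // successors are released.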

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }
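
  // A sketch of how a simulation stage is expected to interact with this
  // interface. The calling stage is not mandated by this class, and
  // setLSUTokenID() is assumed here to be the Instruction setter for the
  // LSUTokenID field mentioned in the documentation of dispatch():
  //
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     IR.getInstruction()->setLSUTokenID(TokenID);
  //     // ... later, before issuing IR to a pipeline:
  //     if (LSU.isReady(IR)) {
  //       // All memory dependencies of IR are resolved.
  //     }
  //   }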

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until
  // commitment. By default we conservatively assume that the LDQ receives a
  // load at dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
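///
/// For example (the queue sizes below are arbitrary illustration values, and
/// SM is the processor's MCSchedModel):
/// \code
///   LSUnit Conservative(SM);  // Unbounded queues; loads may alias stores.
///   LSUnit Optimistic(SM, /*LQ=*/16, /*SQ=*/8,
///                     /*AssumeNoAlias=*/true);  // Loads never alias stores.
/// \endcode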
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining,
/// etc.). Derived classes are responsible for implementing that extra
/// knowledge, and provide different sets of rules for loads and stores by
/// overriding method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer.
/// This class doesn't know how to identify cases where store-to-load
/// forwarding may occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about the
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads,
/// the scheduling model provides an "optimistic" load-to-use latency (which
/// usually matches the load-to-use latency for when there is a hit in the
/// L1D). Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That is, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// 'mayLoad' and 'mayStore' instruction with unmodeled side effects. This is
/// obviously inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
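///
/// For example, consider (in program order) a load, followed by an
/// instruction that both mayLoad and mayStore and has unmodeled side effects
/// (i.e. a full memory barrier), followed by another load. The second load
/// cannot start until the barrier has become the oldest entry in the load
/// queue, which in turn requires the first load to have already executed.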
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB
  // hits/misses), and load/store conflicts, the latency of a load is
  // determined by the depth of the load pipeline. So, we could use field
  // `LoadLatency` in the MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency
  // from L1D, and it usually already accounts for any extra latency due to
  // data forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types,
  // but not 256-bit data types. So, a 256-bit load is effectively split into
  // two 128-bit loads, and each split load consumes one 'LoadQueue' entry.
  // For simplicity, this class optimistically assumes that a load instruction
  // only consumes one entry in the LoadQueue. Similarly, store instructions
  // only consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and
  // consider alternative approaches that let instructions specify the number
  // of load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to
  // execute before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H