//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A memory group identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
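///
/// A minimal sketch of how that token is later used to look the group back up
/// and query its state (`LSU` and `IR` are illustrative names for an
/// LSUnitBase and an InstRef provided by the caller, not part of this class):
/// \code
///   unsigned Token = IR.getInstruction()->getLSUTokenID();
///   const MemoryGroup &Group = LSU.getGroup(Token);
///   bool CanIssue = Group.isReady();
/// \endcode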
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add a dependency if there is no data dependency and
    // all instructions from this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }
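
  // A minimal sketch of how the LSUnit wires a dependency between two groups
  // (group identifiers `OlderID` and `YoungerID` and the `LSU` object are
  // illustrative; see LSUnit::dispatch for the real logic):
  //
  //   MemoryGroup &Older = LSU.getGroup(OlderID);
  //   MemoryGroup &Younger = LSU.getGroup(YoungerID);
  //   Older.addSuccessor(&Younger, /*IsDataDependent=*/true);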

  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }
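
  // The predicates above describe a group's lifecycle. With respect to its
  // predecessors a group goes WAITING -> PENDING -> READY, and with respect
  // to its own instructions it then becomes EXECUTING and finally EXECUTED.
  // A minimal sketch of how a user of this class (for example the LSUnit)
  // might drive the transitions; `G` and `IR` are illustrative names:
  //
  //   if (G.isReady()) {             // All predecessors have executed.
  //     G.onInstructionIssued(IR);   // May flip the group to "executing".
  //     // ... simulated cycles elapse ...
  //     G.onInstructionExecuted(IR); // May flip the group to "executed".
  //   }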

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory dependency.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
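  ///
  /// A minimal sketch of how a dispatch stage might drive this interface,
  /// assuming an LSUnitBase `LSU` and an InstRef `IR` supplied by the caller
  /// (illustrative only; the real stages add further bookkeeping):
  /// \code
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned TokenID = LSU.dispatch(IR);
  ///     IR.getInstruction()->setLSUTokenID(TokenID);
  ///   }
  /// \endcode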
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
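///
/// A minimal construction sketch (`SM` is assumed to be an MCSchedModel
/// provided by the caller; the queue sizes are made-up values):
/// \code
///   LSUnit LSU(SM, /*LQ=*/16, /*SQ=*/12, /*AssumeNoAlias=*/false);
/// \endcode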
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations or
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
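///
/// For illustration, the conservative barrier classification described above
/// might be expressed roughly as follows (a sketch against MCInstrDesc; the
/// actual implementation lives in LSUnit.cpp and may differ):
/// \code
///   bool IsLoadBarrier =
///       MCDesc.mayLoad() && MCDesc.hasUnmodeledSideEffects();
///   bool IsStoreBarrier =
///       MCDesc.mayStore() && MCDesc.hasUnmodeledSideEffects();
/// \endcode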
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H