1 //===------------------------- LSUnit.h --------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// A Load/Store unit class that models load/store queues and that implements
11 /// a simple weak memory consistency model.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_MCA_LSUNIT_H
16 #define LLVM_MCA_LSUNIT_H
17 
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/MC/MCSchedule.h"
21 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
22 #include "llvm/MCA/Instruction.h"
23 
24 namespace llvm {
25 namespace mca {
26 
27 class Scheduler;
28 
29 /// A node of a memory dependency graph. A MemoryGroup describes a set of
30 /// instructions with same memory dependencies.
31 ///
32 /// By construction, instructions of a MemoryGroup don't depend on each other.
33 /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
34 /// A Memory group identifier is then stored as a "token" in field
35 /// Instruction::LSUTokenID of each dispatched instructions. That token is used
36 /// internally by the LSUnit to track memory dependencies.
37 class MemoryGroup {
38   unsigned NumPredecessors;
39   unsigned NumExecutingPredecessors;
40   unsigned NumExecutedPredecessors;
41 
42   unsigned NumInstructions;
43   unsigned NumExecuting;
44   unsigned NumExecuted;
45   SmallVector<MemoryGroup *, 4> Succ;
46 
47   CriticalDependency CriticalPredecessor;
48   InstRef CriticalMemoryInstruction;
49 
50   MemoryGroup(const MemoryGroup &) = delete;
51   MemoryGroup &operator=(const MemoryGroup &) = delete;
52 
53 public:
54   MemoryGroup()
55       : NumPredecessors(0), NumExecutingPredecessors(0),
56         NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
57         NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
58   MemoryGroup(MemoryGroup &&) = default;
59 
60   ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
61   unsigned getNumSuccessors() const { return Succ.size(); }
62   unsigned getNumPredecessors() const { return NumPredecessors; }
63   unsigned getNumExecutingPredecessors() const {
64     return NumExecutingPredecessors;
65   }
66   unsigned getNumExecutedPredecessors() const {
67     return NumExecutedPredecessors;
68   }
69   unsigned getNumInstructions() const { return NumInstructions; }
70   unsigned getNumExecuting() const { return NumExecuting; }
71   unsigned getNumExecuted() const { return NumExecuted; }
72 
73   const InstRef &getCriticalMemoryInstruction() const {
74     return CriticalMemoryInstruction;
75   }
76   const CriticalDependency &getCriticalPredecessor() const {
77     return CriticalPredecessor;
78   }
79 
80   void addSuccessor(MemoryGroup *Group) {
81     Group->NumPredecessors++;
82     assert(!isExecuted() && "Should have been removed!");
83     if (isExecuting())
84       Group->onGroupIssued(CriticalMemoryInstruction);
85     Succ.emplace_back(Group);
86   }
87 
88   bool isWaiting() const {
89     return NumPredecessors >
90            (NumExecutingPredecessors + NumExecutedPredecessors);
91   }
92   bool isPending() const {
93     return NumExecutingPredecessors &&
94            ((NumExecutedPredecessors + NumExecutingPredecessors) ==
95             NumPredecessors);
96   }
97   bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
98   bool isExecuting() const {
99     return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
100   }
101   bool isExecuted() const { return NumInstructions == NumExecuted; }
102 
103   void onGroupIssued(const InstRef &IR) {
104     assert(!isReady() && "Unexpected group-start event!");
105     NumExecutingPredecessors++;
106 
107     unsigned Cycles = IR.getInstruction()->getCyclesLeft();
108     if (CriticalPredecessor.Cycles < Cycles) {
109       CriticalPredecessor.IID = IR.getSourceIndex();
110       CriticalPredecessor.Cycles = Cycles;
111     }
112   }
113 
114   void onGroupExecuted() {
115     assert(!isReady() && "Inconsistent state found!");
116     NumExecutingPredecessors--;
117     NumExecutedPredecessors++;
118   }
119 
120   void onInstructionIssued(const InstRef &IR) {
121     assert(!isExecuting() && "Invalid internal state!");
122     ++NumExecuting;
123 
124     // update the CriticalMemDep.
125     const Instruction &IS = *IR.getInstruction();
126     if ((bool)CriticalMemoryInstruction) {
127       const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
128       if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
129         CriticalMemoryInstruction = IR;
130     } else {
131       CriticalMemoryInstruction = IR;
132     }
133 
134     if (!isExecuting())
135       return;
136 
137     // Notify successors that this group started execution.
138     for (MemoryGroup *MG : Succ)
139       MG->onGroupIssued(CriticalMemoryInstruction);
140   }
141 
142   void onInstructionExecuted() {
143     assert(isReady() && !isExecuted() && "Invalid internal state!");
144     --NumExecuting;
145     ++NumExecuted;
146 
147     if (!isExecuted())
148       return;
149 
150     // Notify successors that this group has finished execution.
151     for (MemoryGroup *MG : Succ)
152       MG->onGroupExecuted();
153   }
154 
155   void addInstruction() {
156     assert(!getNumSuccessors() && "Cannot add instructions to this group!");
157     ++NumInstructions;
158   }
159 
160   void cycleEvent() {
161     if (isWaiting() && CriticalPredecessor.Cycles)
162       CriticalPredecessor.Cycles--;
163   }
164 };
165 
166 /// Abstract base interface for LS (load/store) units in llvm-mca.
167 class LSUnitBase : public HardwareUnit {
168   /// Load queue size.
169   ///
170   /// A value of zero for this field means that the load queue is unbounded.
171   /// Processor models can declare the size of a load queue via tablegen (see
172   /// the definition of tablegen class LoadQueue in
173   /// llvm/Target/TargetSchedule.td).
174   unsigned LQSize;
175 
176   /// Load queue size.
177   ///
178   /// A value of zero for this field means that the store queue is unbounded.
179   /// Processor models can declare the size of a store queue via tablegen (see
180   /// the definition of tablegen class StoreQueue in
181   /// llvm/Target/TargetSchedule.td).
182   unsigned SQSize;
183 
184   unsigned UsedLQEntries;
185   unsigned UsedSQEntries;
186 
187   /// True if loads don't alias with stores.
188   ///
189   /// By default, the LS unit assumes that loads and stores don't alias with
190   /// eachother. If this field is set to false, then loads are always assumed to
191   /// alias with stores.
192   const bool NoAlias;
193 
194   /// Used to map group identifiers to MemoryGroups.
195   DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
196   unsigned NextGroupID;
197 
198 public:
199   LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
200              unsigned StoreQueueSize, bool AssumeNoAlias);
201 
202   virtual ~LSUnitBase();
203 
204   /// Returns the total number of entries in the load queue.
205   unsigned getLoadQueueSize() const { return LQSize; }
206 
207   /// Returns the total number of entries in the store queue.
208   unsigned getStoreQueueSize() const { return SQSize; }
209 
210   unsigned getUsedLQEntries() const { return UsedLQEntries; }
211   unsigned getUsedSQEntries() const { return UsedSQEntries; }
212   void acquireLQSlot() { ++UsedLQEntries; }
213   void acquireSQSlot() { ++UsedSQEntries; }
214   void releaseLQSlot() { --UsedLQEntries; }
215   void releaseSQSlot() { --UsedSQEntries; }
216 
217   bool assumeNoAlias() const { return NoAlias; }
218 
219   enum Status {
220     LSU_AVAILABLE = 0,
221     LSU_LQUEUE_FULL, // Load Queue unavailable
222     LSU_SQUEUE_FULL  // Store Queue unavailable
223   };
224 
225   /// This method checks the availability of the load/store buffers.
226   ///
227   /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
228   /// accomodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
229   /// not a memory operation.
230   virtual Status isAvailable(const InstRef &IR) const = 0;
231 
232   /// Allocates LS resources for instruction IR.
233   ///
234   /// This method assumes that a previous call to `isAvailable(IR)` succeeded
235   /// with a LSUnitBase::Status value of LSU_AVAILABLE.
236   /// Returns the GroupID associated with this instruction. That value will be
237   /// used to set the LSUTokenID field in class Instruction.
238   virtual unsigned dispatch(const InstRef &IR) = 0;
239 
240   bool isSQEmpty() const { return !UsedSQEntries; }
241   bool isLQEmpty() const { return !UsedLQEntries; }
242   bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
243   bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }
244 
245   bool isValidGroupID(unsigned Index) const {
246     return Index && (Groups.find(Index) != Groups.end());
247   }
248 
249   /// Check if a peviously dispatched instruction IR is now ready for execution.
250   bool isReady(const InstRef &IR) const {
251     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
252     const MemoryGroup &Group = getGroup(GroupID);
253     return Group.isReady();
254   }
255 
256   /// Check if instruction IR only depends on memory instructions that are
257   /// currently executing.
258   bool isPending(const InstRef &IR) const {
259     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
260     const MemoryGroup &Group = getGroup(GroupID);
261     return Group.isPending();
262   }
263 
264   /// Check if instruction IR is still waiting on memory operations, and the
265   /// wait time is still unknown.
266   bool isWaiting(const InstRef &IR) const {
267     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
268     const MemoryGroup &Group = getGroup(GroupID);
269     return Group.isWaiting();
270   }
271 
272   bool hasDependentUsers(const InstRef &IR) const {
273     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
274     const MemoryGroup &Group = getGroup(GroupID);
275     return !Group.isExecuted() && Group.getNumSuccessors();
276   }
277 
278   const MemoryGroup &getGroup(unsigned Index) const {
279     assert(isValidGroupID(Index) && "Group doesn't exist!");
280     return *Groups.find(Index)->second;
281   }
282 
283   MemoryGroup &getGroup(unsigned Index) {
284     assert(isValidGroupID(Index) && "Group doesn't exist!");
285     return *Groups.find(Index)->second;
286   }
287 
288   unsigned createMemoryGroup() {
289     Groups.insert(
290         std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
291     return NextGroupID++;
292   }
293 
294   virtual void onInstructionExecuted(const InstRef &IR);
295 
296   // Loads are tracked by the LDQ (load queue) from dispatch until completion.
297   // Stores are tracked by the STQ (store queue) from dispatch until commitment.
298   // By default we conservatively assume that the LDQ receives a load at
299   // dispatch. Loads leave the LDQ at retirement stage.
300   virtual void onInstructionRetired(const InstRef &IR);
301 
302   virtual void onInstructionIssued(const InstRef &IR) {
303     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
304     Groups[GroupID]->onInstructionIssued(IR);
305   }
306 
307   virtual void cycleEvent();
308 
309 #ifndef NDEBUG
310   void dump() const;
311 #endif
312 };
313 
314 /// Default Load/Store Unit (LS Unit) for simulated processors.
315 ///
316 /// Each load (or store) consumes one entry in the load (or store) queue.
317 ///
318 /// Rules are:
319 /// 1) A younger load is allowed to pass an older load only if there are no
320 ///    stores nor barriers in between the two loads.
321 /// 2) An younger store is not allowed to pass an older store.
322 /// 3) A younger store is not allowed to pass an older load.
323 /// 4) A younger load is allowed to pass an older store only if the load does
324 ///    not alias with the store.
325 ///
326 /// This class optimistically assumes that loads don't alias store operations.
327 /// Under this assumption, younger loads are always allowed to pass older
328 /// stores (this would only affects rule 4).
329 /// Essentially, this class doesn't perform any sort alias analysis to
330 /// identify aliasing loads and stores.
331 ///
332 /// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
333 /// set to `false` by the constructor of LSUnit.
334 ///
335 /// Note that this class doesn't know about the existence of different memory
336 /// types for memory operations (example: write-through, write-combining, etc.).
337 /// Derived classes are responsible for implementing that extra knowledge, and
338 /// provide different sets of rules for loads and stores by overriding method
339 /// `isReady()`.
340 /// To emulate a write-combining memory type, rule 2. must be relaxed in a
341 /// derived class to enable the reordering of non-aliasing store operations.
342 ///
343 /// No assumptions are made by this class on the size of the store buffer.  This
344 /// class doesn't know how to identify cases where store-to-load forwarding may
345 /// occur.
346 ///
347 /// LSUnit doesn't attempt to predict whether a load or store hits or misses
348 /// the L1 cache. To be more specific, LSUnit doesn't know anything about
349 /// cache hierarchy and memory types.
350 /// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
351 /// scheduling model provides an "optimistic" load-to-use latency (which usually
352 /// matches the load-to-use latency for when there is a hit in the L1D).
353 /// Derived classes may expand this knowledge.
354 ///
355 /// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
356 /// memory-barrier like instructions.
357 /// LSUnit conservatively assumes that an instruction which `mayLoad` and has
358 /// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
359 /// serializes loads without forcing a flush of the load queue.
360 /// Similarly, instructions that both `mayStore` and have `unmodeled side
361 /// effects` are treated like store barriers. A full memory
362 /// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
363 /// effects. This is obviously inaccurate, but this is the best that we can do
364 /// at the moment.
365 ///
366 /// Each load/store barrier consumes one entry in the load/store queue. A
367 /// load/store barrier enforces ordering of loads/stores:
368 ///  - A younger load cannot pass a load barrier.
369 ///  - A younger store cannot pass a store barrier.
370 ///
371 /// A younger load has to wait for the memory load barrier to execute.
372 /// A load/store barrier is "executed" when it becomes the oldest entry in
373 /// the load/store queue(s). That also means, all the older loads/stores have
374 /// already been executed.
375 class LSUnit : public LSUnitBase {
376   // This class doesn't know about the latency of a load instruction. So, it
377   // conservatively/pessimistically assumes that the latency of a load opcode
378   // matches the instruction latency.
379   //
380   // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
381   // and load/store conflicts, the latency of a load is determined by the depth
382   // of the load pipeline. So, we could use field `LoadLatency` in the
383   // MCSchedModel to model that latency.
384   // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
385   // L1D, and it usually already accounts for any extra latency due to data
386   // forwarding.
387   // When doing throughput analysis, `LoadLatency` is likely to
388   // be a better predictor of load latency than instruction latency. This is
389   // particularly true when simulating code with temporal/spatial locality of
390   // memory accesses.
391   // Using `LoadLatency` (instead of the instruction latency) is also expected
392   // to improve the load queue allocation for long latency instructions with
393   // folded memory operands (See PR39829).
394   //
395   // FIXME: On some processors, load/store operations are split into multiple
396   // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
397   // not 256-bit data types. So, a 256-bit load is effectively split into two
398   // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
399   // simplicity, this class optimistically assumes that a load instruction only
400   // consumes one entry in the LoadQueue.  Similarly, store instructions only
401   // consume a single entry in the StoreQueue.
402   // In future, we should reassess the quality of this design, and consider
403   // alternative approaches that let instructions specify the number of
404   // load/store queue entries which they consume at dispatch stage (See
405   // PR39830).
406   //
407   // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
408   // conservatively treated as a store barrier. It forces older store to be
409   // executed before newer stores are issued.
410   //
411   // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
412   // conservatively treated as a load barrier. It forces older loads to execute
413   // before newer loads are issued.
414   unsigned CurrentLoadGroupID;
415   unsigned CurrentLoadBarrierGroupID;
416   unsigned CurrentStoreGroupID;
417 
418 public:
419   LSUnit(const MCSchedModel &SM)
420       : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
421   LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
422       : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
423   LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
424       : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
425         CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
426 
427   /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
428   /// accomodate instruction IR.
429   Status isAvailable(const InstRef &IR) const override;
430 
431   /// Allocates LS resources for instruction IR.
432   ///
433   /// This method assumes that a previous call to `isAvailable(IR)` succeeded
434   /// returning LSU_AVAILABLE.
435   ///
436   /// Rules are:
437   /// By default, rules are:
438   /// 1. A store may not pass a previous store.
439   /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
440   /// 3. A load may pass a previous load.
441   /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
442   /// 5. A load has to wait until an older load barrier is fully executed.
443   /// 6. A store has to wait until an older store barrier is fully executed.
444   unsigned dispatch(const InstRef &IR) override;
445 
446   void onInstructionExecuted(const InstRef &IR) override;
447 };
448 
449 } // namespace mca
450 } // namespace llvm
451 
452 #endif // LLVM_MCA_LSUNIT_H
453