//===------------------------- LSUnit.h -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At the dispatch stage, instructions are mapped by the LSUnit to
/// MemoryGroups. A memory group identifier is then stored as a "token" in
/// field Instruction::LSUTokenID of each dispatched instruction. That token is
/// used internally by the LSUnit to track memory dependencies.
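///
/// For illustration only, a minimal sketch of how two groups could be chained
/// through a data dependency. The LSUnit normally owns the groups and does
/// this wiring internally; `IRofA` is a placeholder for a valid InstRef.
///
/// \code
///   MemoryGroup A, B;
///   A.addInstruction(); // A models a single load or store.
///   B.addInstruction();
///   A.addSuccessor(&B, /*IsDataDependent=*/true); // B data-depends on A.
///   // B.isWaiting():  A has not started executing yet.
///   A.onInstructionIssued(IRofA);
///   // B.isPending():  all of B's predecessors are now executing.
///   A.onInstructionExecuted();
///   // B.isReady():    A has fully executed, so B can now be issued.
/// \endcode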
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add an order dependency if all instructions from
    // this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

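  // Group state, as seen through the predicates below: a group is "waiting"
  // while at least one predecessor group has not started executing yet;
  // "pending" once every predecessor has at least started executing, but not
  // every dependency has been released; "ready" once all predecessor
  // dependencies have been released (order dependencies are released when the
  // predecessor starts executing, data dependencies when it finishes); and
  // "executed" once all of its own instructions have finished executing.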
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// When this flag is true, the LS unit assumes that loads and stores never
  /// alias with each other. When it is false, loads are always assumed to
  /// alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
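
  // Queue occupancy bookkeeping: a slot is acquired when a memory operation is
  // dispatched to this unit, and released once the operation leaves the
  // corresponding queue (see the comment on onInstructionRetired below).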
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
  /// not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
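
  // For illustration only: a hypothetical dispatch() implementation could use
  // the helpers above to create a group for an incoming instruction and make
  // it wait for an older group. `OlderGID` is a placeholder identifier; the
  // data dependency keeps the new group from becoming ready until the older
  // group has fully executed.
  //
  //   unsigned NewGID = createMemoryGroup();
  //   MemoryGroup &NewGroup = getGroup(NewGID);
  //   NewGroup.addInstruction();
  //   if (isValidGroupID(OlderGID))
  //     getGroup(OlderGID).addSuccessor(&NewGroup, /*IsDataDependent=*/true);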

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default, we conservatively assume that the LDQ receives a load at
  // dispatch, and that loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores or barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, the `AssumeNoAlias` flag must
/// be set to `false` when constructing the LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// for providing different sets of rules for loads and stores by overriding
/// method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations or
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load barrier. That is, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// `mayLoad` and `mayStore` instruction with unmodeled side effects. This is
/// obviously inaccurate, but it is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for an older load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
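///
/// For illustration, a simplified sketch of how a simulation stage might
/// drive this unit for a memory instruction IR. In llvm-mca these calls are
/// made by the pipeline stages; `SM` stands for the processor's MCSchedModel
/// and `IR` for a valid InstRef:
///
/// \code
///   LSUnit LSU(SM); // Unbounded queues; loads conservatively alias stores.
///   if (LSU.isAvailable(IR) == LSUnit::LSU_AVAILABLE) {
///     // The returned token is stored in Instruction::LSUTokenID.
///     unsigned TokenID = LSU.dispatch(IR);
///     // ... later, once LSU.isReady(IR) returns true, issue the instruction:
///     LSU.onInstructionIssued(IR);
///     // ... when the instruction finishes executing:
///     LSU.onInstructionExecuted(IR);
///     // ... and when it retires:
///     LSU.onInstructionRetired(IR);
///   }
/// \endcode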
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at the dispatch stage (see
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}
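
  // For illustration (the queue sizes are made up): a core with a 16-entry
  // load queue and a 12-entry store queue, keeping the conservative assumption
  // that loads may alias stores, could be modeled as:
  //
  //   LSUnit LSU(SM, /*LQ=*/16, /*SQ=*/12, /*AssumeNoAlias=*/false);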

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// and returned LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H