//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
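// Note: these flags combine with the usual bitmask operators; for example,
// insertRelease() below passes SIMemOp::LOAD | SIMemOp::STORE to order both
// kinds of access at once.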

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
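// Membership tests throughout this file use the bitmask idiom, e.g.
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// to check whether an operation touches the global address space.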

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
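  // GFX90A has its own cache-control behaviour regardless of generation, so it
  // is dispatched before the generation-based checks below.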
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
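    // A count of 0 waits for all outstanding operations tracked by that
    // counter, while passing a counter's full bit mask (its maximum value)
    // requests no wait; the EXP counter is never waited on here.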
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
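  // A release here only needs to wait for earlier accesses to complete; the L1
  // is write through (see enableStoreCacheBypass above), so there is no cache
  // that has to be written back or invalidated at this point.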
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
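
// For example, an agent-scope atomic load ends up with both bits set so that
// it bypasses the per-CU L0 (glc) and the L1 (dlc), roughly:
//   global_load_dword v0, v[0:1], off glc dlc
// whereas a workgroup-scope load in WGP mode only sets glc to bypass the L0.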

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure the operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space can
    // be observable outside the program, so there is no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
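
// For example, a volatile global load is marked glc dlc and is followed by a
// system-scope wait so it completes in a global order, roughly:
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// while a nontemporal access only gets slc set, e.g.:
//   global_store_dword v[0:1], v2, off slc
// (Illustrative only; the exact waitcnt emitted depends on the operation.)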

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
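
// For example, an agent-scope wait covering both loads and stores is expected
// to produce both counters, roughly:
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0
// since GFX10 counts outstanding stores separately in vscnt. In CU mode a
// workgroup-scope wait emits nothing for global memory, as all waves of the
// work-group share the same L0.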

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
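
// For example, an agent-scope acquire invalidates both cache levels, roughly:
//   buffer_gl0_inv
//   buffer_gl1_inv
// while a workgroup-scope acquire in WGP mode only needs:
//   buffer_gl0_inv
// and in CU mode needs no invalidate at all.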

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
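
// For example, on GFX10 a sequentially consistent atomic load at agent scope
// is expected to expand roughly as:
//   s_waitcnt vmcnt(0)            ; wait before (plus s_waitcnt_vscnt null, 0)
//   global_load_dword ... glc dlc ; the load itself, bypassing L0/L1
//   s_waitcnt vmcnt(0)            ; wait for the load to complete
//   buffer_gl0_inv
//   buffer_gl1_inv                ; acquire: invalidate stale cache lines
// (Illustrative only; the exact sequence depends on the subtarget and the
// address spaces involved.)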

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}
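
// For example, a release atomic store at agent scope is preceded by a release
// barrier; on GFX10 this is expected to amount to waiting for all outstanding
// memory operations before the store, roughly:
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0
//   global_store_dword v[0:1], v2, off
// (Illustrative only; insertRelease is defined earlier in this file and its
// exact expansion varies per subtarget.)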

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBINV*" instruction. This could be done by
    // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass
    // to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
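
// For example, a fence with syncscope("agent") and acq_rel ordering is
// expected to lower on GFX10 to a wait for outstanding operations followed by
// cache invalidates, roughly:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0
//   buffer_gl0_inv
//   buffer_gl1_inv
// with the ATOMIC_FENCE pseudo itself removed by removeAtomicPseudoMIs().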

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
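
// For example, a sequentially consistent returning atomic RMW at agent scope
// is expected to expand roughly as:
//   <release: wait for prior operations>
//   global_atomic_add v0, v[0:1], v2, off glc
//   s_waitcnt vmcnt(0)   ; wait for the returned value (SIMemOp::LOAD)
//   buffer_gl0_inv
//   buffer_gl1_inv       ; acquire: invalidate stale cache lines
// (Illustrative only; a non-returning RMW instead waits on the store counter.)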

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}