1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
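///
/// As a rough illustration (not an exhaustive description; see AMDGPUUsage
/// for the authoritative tables), a sequentially consistent global atomic
/// load at agent scope on GFX6 is expanded along the lines of:
///
///   s_waitcnt vmcnt(0) lgkmcnt(0)   ; wait for earlier accesses (release)
///   buffer_load_dword ... glc       ; bypass the L1 cache
///   s_waitcnt vmcnt(0)              ; wait for the load itself (acquire)
///   buffer_wbinvl1                  ; invalidate L1 so later loads are fresh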
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/Support/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
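
// Callers combine and test these values as bitmask flags, for example passing
// SIMemOp::LOAD | SIMemOp::STORE when both kinds must be ordered (as
// insertRelease() does below), and testing a particular kind with
// (Op & SIMemOp::LOAD) != SIMemOp::NONE.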
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52   BEFORE,
53   AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58   NONE,
59   SINGLETHREAD,
60   WAVEFRONT,
61   WORKGROUP,
62   AGENT,
63   SYSTEM
64 };
65 
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
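
// For example, a FLAT access contributes SIAtomicAddrSpace::FLAT (that is,
// GLOBAL | LDS | SCRATCH) via toSIAtomicAddrSpace(), since a flat address may
// alias any of those address spaces.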
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
  /// \returns A bit set of the address spaces accessed by \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
232   Optional<SIMemOpInfo> constructFromMIWithMMO(
233       const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "None" otherwise.
241   Optional<SIMemOpInfo> getLoadInfo(
242       const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "None" otherwise.
245   Optional<SIMemOpInfo> getStoreInfo(
246       const MachineBasicBlock::iterator &MI) const;
247 
248   /// \returns Atomic fence info if \p MI is an atomic fence operation,
249   /// "None" otherwise.
250   Optional<SIMemOpInfo> getAtomicFenceInfo(
251       const MachineBasicBlock::iterator &MI) const;
252 
253   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
254   /// rmw operation, "None" otherwise.
255   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
256       const MachineBasicBlock::iterator &MI) const;
257 };
258 
259 class SICacheControl {
260 protected:
261 
262   /// AMDGPU subtarget info.
263   const GCNSubtarget &ST;
264 
265   /// Instruction info.
266   const SIInstrInfo *TII = nullptr;
267 
268   IsaVersion IV;
269 
270   /// Whether to insert cache invalidating instructions.
271   bool InsertCacheInv;
272 
273   SICacheControl(const GCNSubtarget &ST);
274 
  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
277   bool enableNamedBit(const MachineBasicBlock::iterator MI,
278                       AMDGPU::CPol::CPol Bit) const;
279 
280 public:
281 
282   /// Create a cache control for the subtarget \p ST.
283   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
284 
285   /// Update \p MI memory load instruction to bypass any caches up to
286   /// the \p Scope memory scope for address spaces \p
287   /// AddrSpace. Return true iff the instruction was modified.
288   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
289                                      SIAtomicScope Scope,
290                                      SIAtomicAddrSpace AddrSpace) const = 0;
291 
292   /// Update \p MI memory store instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
296                                       SIAtomicScope Scope,
297                                       SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory read-modify-write instruction to bypass any caches up
300   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
301   /// iff the instruction was modified.
302   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
303                                     SIAtomicScope Scope,
304                                     SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory instruction of kind \p Op associated with address
307   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
308   /// true iff the instruction was modified.
309   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
310                                               SIAtomicAddrSpace AddrSpace,
311                                               SIMemOp Op, bool IsVolatile,
312                                               bool IsNonTemporal) const = 0;
313 
314   /// Inserts any necessary instructions at position \p Pos relative
315   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
316   /// \p Op associated with address spaces \p AddrSpace have completed. Used
317   /// between memory instructions to enforce the order they become visible as
318   /// observed by other memory instructions executing in memory scope \p Scope.
319   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
321   virtual bool insertWait(MachineBasicBlock::iterator &MI,
322                           SIAtomicScope Scope,
323                           SIAtomicAddrSpace AddrSpace,
324                           SIMemOp Op,
325                           bool IsCrossAddrSpaceOrdering,
326                           Position Pos) const = 0;
327 
328   /// Inserts any necessary instructions at position \p Pos relative to
329   /// instruction \p MI to ensure any subsequent memory instructions of this
330   /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
333   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
334                              SIAtomicScope Scope,
335                              SIAtomicAddrSpace AddrSpace,
336                              Position Pos) const = 0;
337 
338   /// Inserts any necessary instructions at position \p Pos relative to
339   /// instruction \p MI to ensure previous memory instructions by this thread
340   /// with address spaces \p AddrSpace have completed and can be observed by
341   /// subsequent memory instructions by any thread executing in memory scope \p
342   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
344   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
345                              SIAtomicScope Scope,
346                              SIAtomicAddrSpace AddrSpace,
347                              bool IsCrossAddrSpaceOrdering,
348                              Position Pos) const = 0;
349 
350   /// Virtual destructor to allow derivations to be deleted.
351   virtual ~SICacheControl() = default;
352 
353 };
354 
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357 
358   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359   /// is modified, false otherwise.
360   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361     return enableNamedBit(MI, AMDGPU::CPol::GLC);
362   }
363 
364   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365   /// is modified, false otherwise.
366   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367     return enableNamedBit(MI, AMDGPU::CPol::SLC);
368   }
369 
370 public:
371 
372   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
373 
374   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375                              SIAtomicScope Scope,
376                              SIAtomicAddrSpace AddrSpace) const override;
377 
378   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379                               SIAtomicScope Scope,
380                               SIAtomicAddrSpace AddrSpace) const override;
381 
382   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383                             SIAtomicScope Scope,
384                             SIAtomicAddrSpace AddrSpace) const override;
385 
386   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388                                       bool IsVolatile,
389                                       bool IsNonTemporal) const override;
390 
391   bool insertWait(MachineBasicBlock::iterator &MI,
392                   SIAtomicScope Scope,
393                   SIAtomicAddrSpace AddrSpace,
394                   SIMemOp Op,
395                   bool IsCrossAddrSpaceOrdering,
396                   Position Pos) const override;
397 
398   bool insertAcquire(MachineBasicBlock::iterator &MI,
399                      SIAtomicScope Scope,
400                      SIAtomicAddrSpace AddrSpace,
401                      Position Pos) const override;
402 
403   bool insertRelease(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      bool IsCrossAddrSpaceOrdering,
407                      Position Pos) const override;
408 };
409 
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412 
413   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
414 
415   bool insertAcquire(MachineBasicBlock::iterator &MI,
416                      SIAtomicScope Scope,
417                      SIAtomicAddrSpace AddrSpace,
418                      Position Pos) const override;
419 
420 };
421 
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424 
425   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
426 
427   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428                              SIAtomicScope Scope,
429                              SIAtomicAddrSpace AddrSpace) const override;
430 
431   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432                               SIAtomicScope Scope,
433                               SIAtomicAddrSpace AddrSpace) const override;
434 
435   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436                             SIAtomicScope Scope,
437                             SIAtomicAddrSpace AddrSpace) const override;
438 
439   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441                                       bool IsVolatile,
442                                       bool IsNonTemporal) const override;
443 
444   bool insertWait(MachineBasicBlock::iterator &MI,
445                   SIAtomicScope Scope,
446                   SIAtomicAddrSpace AddrSpace,
447                   SIMemOp Op,
448                   bool IsCrossAddrSpaceOrdering,
449                   Position Pos) const override;
450 
451   bool insertAcquire(MachineBasicBlock::iterator &MI,
452                      SIAtomicScope Scope,
453                      SIAtomicAddrSpace AddrSpace,
454                      Position Pos) const override;
455 
456   bool insertRelease(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      bool IsCrossAddrSpaceOrdering,
460                      Position Pos) const override;
461 };
462 
463 class SIGfx940CacheControl : public SIGfx90ACacheControl {
464 protected:
465 
466   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
467   /// is modified, false otherwise.
468   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
469     return enableNamedBit(MI, AMDGPU::CPol::SC0);
470   }
471 
472   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
473   /// is modified, false otherwise.
474   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
475     return enableNamedBit(MI, AMDGPU::CPol::SC1);
476   }
477 
478   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
479   /// is modified, false otherwise.
480   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
481     return enableNamedBit(MI, AMDGPU::CPol::NT);
482   }
483 
484 public:
485 
  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
487 
488   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
489                              SIAtomicScope Scope,
490                              SIAtomicAddrSpace AddrSpace) const override;
491 
492   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
493                               SIAtomicScope Scope,
494                               SIAtomicAddrSpace AddrSpace) const override;
495 
496   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
497                             SIAtomicScope Scope,
498                             SIAtomicAddrSpace AddrSpace) const override;
499 
500   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
501                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
502                                       bool IsVolatile,
503                                       bool IsNonTemporal) const override;
504 
505   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
506                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
507 
508   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
509                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
510                      Position Pos) const override;
511 };
512 
513 class SIGfx10CacheControl : public SIGfx7CacheControl {
514 protected:
515 
516   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
517   /// is modified, false otherwise.
518   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
519     return enableNamedBit(MI, AMDGPU::CPol::DLC);
520   }
521 
522 public:
523 
524   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
525 
526   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
527                              SIAtomicScope Scope,
528                              SIAtomicAddrSpace AddrSpace) const override;
529 
530   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
531                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
532                                       bool IsVolatile,
533                                       bool IsNonTemporal) const override;
534 
535   bool insertWait(MachineBasicBlock::iterator &MI,
536                   SIAtomicScope Scope,
537                   SIAtomicAddrSpace AddrSpace,
538                   SIMemOp Op,
539                   bool IsCrossAddrSpaceOrdering,
540                   Position Pos) const override;
541 
542   bool insertAcquire(MachineBasicBlock::iterator &MI,
543                      SIAtomicScope Scope,
544                      SIAtomicAddrSpace AddrSpace,
545                      Position Pos) const override;
546 };
547 
548 class SIGfx11CacheControl : public SIGfx10CacheControl {
549 public:
550   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
551 
552   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
553                              SIAtomicScope Scope,
554                              SIAtomicAddrSpace AddrSpace) const override;
555 
556   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
557                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
558                                       bool IsVolatile,
559                                       bool IsNonTemporal) const override;
560 };
561 
562 class SIMemoryLegalizer final : public MachineFunctionPass {
563 private:
564 
565   /// Cache Control.
566   std::unique_ptr<SICacheControl> CC = nullptr;
567 
568   /// List of atomic pseudo instructions.
569   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
570 
  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
573   bool isAtomicRet(const MachineInstr &MI) const {
574     return SIInstrInfo::isAtomicRet(MI);
575   }
576 
577   /// Removes all processed atomic pseudo instructions from the current
578   /// function. Returns true if current function is modified, false otherwise.
579   bool removeAtomicPseudoMIs();
580 
581   /// Expands load operation \p MI. Returns true if instructions are
582   /// added/deleted or \p MI is modified, false otherwise.
583   bool expandLoad(const SIMemOpInfo &MOI,
584                   MachineBasicBlock::iterator &MI);
585   /// Expands store operation \p MI. Returns true if instructions are
586   /// added/deleted or \p MI is modified, false otherwise.
587   bool expandStore(const SIMemOpInfo &MOI,
588                    MachineBasicBlock::iterator &MI);
589   /// Expands atomic fence operation \p MI. Returns true if
590   /// instructions are added/deleted or \p MI is modified, false otherwise.
591   bool expandAtomicFence(const SIMemOpInfo &MOI,
592                          MachineBasicBlock::iterator &MI);
593   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
594   /// instructions are added/deleted or \p MI is modified, false otherwise.
595   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
596                                 MachineBasicBlock::iterator &MI);
597 
598 public:
599   static char ID;
600 
601   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
602 
603   void getAnalysisUsage(AnalysisUsage &AU) const override {
604     AU.setPreservesCFG();
605     MachineFunctionPass::getAnalysisUsage(AU);
606   }
607 
608   StringRef getPassName() const override {
609     return PASS_NAME;
610   }
611 
612   bool runOnMachineFunction(MachineFunction &MF) override;
613 };
614 
} // end anonymous namespace
616 
617 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
618                                       const char *Msg) const {
619   const Function &Func = MI->getParent()->getParent()->getFunction();
620   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
621   Func.getContext().diagnose(Diag);
622 }
623 
624 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
625 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
626                                SIAtomicAddrSpace InstrAddrSpace) const {
627   if (SSID == SyncScope::System)
628     return std::make_tuple(SIAtomicScope::SYSTEM,
629                            SIAtomicAddrSpace::ATOMIC,
630                            true);
631   if (SSID == MMI->getAgentSSID())
632     return std::make_tuple(SIAtomicScope::AGENT,
633                            SIAtomicAddrSpace::ATOMIC,
634                            true);
635   if (SSID == MMI->getWorkgroupSSID())
636     return std::make_tuple(SIAtomicScope::WORKGROUP,
637                            SIAtomicAddrSpace::ATOMIC,
638                            true);
639   if (SSID == MMI->getWavefrontSSID())
640     return std::make_tuple(SIAtomicScope::WAVEFRONT,
641                            SIAtomicAddrSpace::ATOMIC,
642                            true);
643   if (SSID == SyncScope::SingleThread)
644     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
645                            SIAtomicAddrSpace::ATOMIC,
646                            true);
647   if (SSID == MMI->getSystemOneAddressSpaceSSID())
648     return std::make_tuple(SIAtomicScope::SYSTEM,
649                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
650                            false);
651   if (SSID == MMI->getAgentOneAddressSpaceSSID())
652     return std::make_tuple(SIAtomicScope::AGENT,
653                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
654                            false);
655   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
656     return std::make_tuple(SIAtomicScope::WORKGROUP,
657                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
658                            false);
659   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
660     return std::make_tuple(SIAtomicScope::WAVEFRONT,
661                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
662                            false);
663   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
664     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
665                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
666                            false);
667   return None;
668 }
669 
670 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
671   if (AS == AMDGPUAS::FLAT_ADDRESS)
672     return SIAtomicAddrSpace::FLAT;
673   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
674     return SIAtomicAddrSpace::GLOBAL;
675   if (AS == AMDGPUAS::LOCAL_ADDRESS)
676     return SIAtomicAddrSpace::LDS;
677   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
678     return SIAtomicAddrSpace::SCRATCH;
679   if (AS == AMDGPUAS::REGION_ADDRESS)
680     return SIAtomicAddrSpace::GDS;
681 
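  // Any other address space (for example the constant address space) is
  // classified as OTHER and is not one of the ATOMIC address spaces.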
682   return SIAtomicAddrSpace::OTHER;
683 }
684 
685 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
686   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
687 }
688 
689 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
690     const MachineBasicBlock::iterator &MI) const {
691   assert(MI->getNumMemOperands() > 0);
692 
693   SyncScope::ID SSID = SyncScope::SingleThread;
694   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
695   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
696   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
697   bool IsNonTemporal = true;
698   bool IsVolatile = false;
699 
700   // Validator should check whether or not MMOs cover the entire set of
701   // locations accessed by the memory instruction.
702   for (const auto &MMO : MI->memoperands()) {
703     IsNonTemporal &= MMO->isNonTemporal();
704     IsVolatile |= MMO->isVolatile();
705     InstrAddrSpace |=
706       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
707     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
708     if (OpOrdering != AtomicOrdering::NotAtomic) {
709       const auto &IsSyncScopeInclusion =
710           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
711       if (!IsSyncScopeInclusion) {
712         reportUnsupported(MI,
713           "Unsupported non-inclusive atomic synchronization scope");
714         return None;
715       }
716 
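      // Keep the wider of the two synchronization scopes; the non-inclusive
      // case was rejected above.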
717       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
718       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
719       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
720              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
721       FailureOrdering =
722           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
723     }
724   }
725 
726   SIAtomicScope Scope = SIAtomicScope::NONE;
727   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
728   bool IsCrossAddressSpaceOrdering = false;
729   if (Ordering != AtomicOrdering::NotAtomic) {
730     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
731     if (!ScopeOrNone) {
732       reportUnsupported(MI, "Unsupported atomic synchronization scope");
733       return None;
734     }
735     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
736         *ScopeOrNone;
737     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
738         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
739         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
740       reportUnsupported(MI, "Unsupported atomic address space");
741       return None;
742     }
743   }
744   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
745                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
746                      IsNonTemporal);
747 }
748 
749 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
750     const MachineBasicBlock::iterator &MI) const {
751   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
752 
753   if (!(MI->mayLoad() && !MI->mayStore()))
754     return None;
755 
756   // Be conservative if there are no memory operands.
757   if (MI->getNumMemOperands() == 0)
758     return SIMemOpInfo();
759 
760   return constructFromMIWithMMO(MI);
761 }
762 
763 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
764     const MachineBasicBlock::iterator &MI) const {
765   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
766 
767   if (!(!MI->mayLoad() && MI->mayStore()))
768     return None;
769 
770   // Be conservative if there are no memory operands.
771   if (MI->getNumMemOperands() == 0)
772     return SIMemOpInfo();
773 
774   return constructFromMIWithMMO(MI);
775 }
776 
777 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
778     const MachineBasicBlock::iterator &MI) const {
779   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
780 
781   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
782     return None;
783 
784   AtomicOrdering Ordering =
785     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
786 
787   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
788   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
789   if (!ScopeOrNone) {
790     reportUnsupported(MI, "Unsupported atomic synchronization scope");
791     return None;
792   }
793 
794   SIAtomicScope Scope = SIAtomicScope::NONE;
795   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
796   bool IsCrossAddressSpaceOrdering = false;
797   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
798       *ScopeOrNone;
799 
800   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
801       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
802     reportUnsupported(MI, "Unsupported atomic address space");
803     return None;
804   }
805 
806   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
807                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
808 }
809 
810 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
811     const MachineBasicBlock::iterator &MI) const {
812   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
813 
814   if (!(MI->mayLoad() && MI->mayStore()))
815     return None;
816 
817   // Be conservative if there are no memory operands.
818   if (MI->getNumMemOperands() == 0)
819     return SIMemOpInfo();
820 
821   return constructFromMIWithMMO(MI);
822 }
823 
824 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
825   TII = ST.getInstrInfo();
826   IV = getIsaVersion(ST.getCPU());
827   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
828 }
829 
830 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
831                                     AMDGPU::CPol::CPol Bit) const {
832   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
833   if (!CPol)
834     return false;
835 
836   CPol->setImm(CPol->getImm() | Bit);
837   return true;
838 }
839 
840 /* static */
841 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
842   GCNSubtarget::Generation Generation = ST.getGeneration();
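  // GFX940 and GFX90A are detected by feature rather than by generation, so
  // check them before the generation-based dispatch below.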
843   if (ST.hasGFX940Insts())
844     return std::make_unique<SIGfx940CacheControl>(ST);
845   if (ST.hasGFX90AInsts())
846     return std::make_unique<SIGfx90ACacheControl>(ST);
847   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
848     return std::make_unique<SIGfx6CacheControl>(ST);
849   if (Generation < AMDGPUSubtarget::GFX10)
850     return std::make_unique<SIGfx7CacheControl>(ST);
851   if (Generation < AMDGPUSubtarget::GFX11)
852     return std::make_unique<SIGfx10CacheControl>(ST);
853   return std::make_unique<SIGfx11CacheControl>(ST);
854 }
855 
856 bool SIGfx6CacheControl::enableLoadCacheBypass(
857     const MachineBasicBlock::iterator &MI,
858     SIAtomicScope Scope,
859     SIAtomicAddrSpace AddrSpace) const {
860   assert(MI->mayLoad() && !MI->mayStore());
861   bool Changed = false;
862 
863   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
864     switch (Scope) {
865     case SIAtomicScope::SYSTEM:
866     case SIAtomicScope::AGENT:
867       // Set L1 cache policy to MISS_EVICT.
868       // Note: there is no L2 cache bypass policy at the ISA level.
869       Changed |= enableGLCBit(MI);
870       break;
871     case SIAtomicScope::WORKGROUP:
872     case SIAtomicScope::WAVEFRONT:
873     case SIAtomicScope::SINGLETHREAD:
874       // No cache to bypass.
875       break;
876     default:
877       llvm_unreachable("Unsupported synchronization scope");
878     }
879   }
880 
881   /// The scratch address space does not need the global memory caches
882   /// to be bypassed as all memory operations by the same thread are
883   /// sequentially consistent, and no other thread can access scratch
884   /// memory.
885 
886   /// Other address spaces do not have a cache.
887 
888   return Changed;
889 }
890 
891 bool SIGfx6CacheControl::enableStoreCacheBypass(
892     const MachineBasicBlock::iterator &MI,
893     SIAtomicScope Scope,
894     SIAtomicAddrSpace AddrSpace) const {
895   assert(!MI->mayLoad() && MI->mayStore());
896   bool Changed = false;
897 
  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.
900 
901   return Changed;
902 }
903 
904 bool SIGfx6CacheControl::enableRMWCacheBypass(
905     const MachineBasicBlock::iterator &MI,
906     SIAtomicScope Scope,
907     SIAtomicAddrSpace AddrSpace) const {
908   assert(MI->mayLoad() && MI->mayStore());
909   bool Changed = false;
910 
  /// Do not set GLC for RMW atomic operations as the L0/L1 cache is
  /// automatically bypassed, and the GLC bit is instead used to indicate if
  /// they are return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.
915 
916   return Changed;
917 }
918 
919 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
920     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
921     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled here sensibly without pessimizing all atomics; they also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
932 
933   bool Changed = false;
934 
935   if (IsVolatile) {
936     // Set L1 cache policy to be MISS_EVICT for load instructions
937     // and MISS_LRU for store instructions.
938     // Note: there is no L2 cache bypass policy at the ISA level.
939     if (Op == SIMemOp::LOAD)
940       Changed |= enableGLCBit(MI);
941 
942     // Ensure operation has completed at system scope to cause all volatile
943     // operations to be visible outside the program in a global order. Do not
944     // request cross address space as only the global address space can be
945     // observable outside the program, so no need to cause a waitcnt for LDS
946     // address space operations.
947     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
948                           Position::AFTER);
949 
950     return Changed;
951   }
952 
953   if (IsNonTemporal) {
954     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
955     // for both loads and stores, and the L2 cache policy to STREAM.
956     Changed |= enableGLCBit(MI);
957     Changed |= enableSLCBit(MI);
958     return Changed;
959   }
960 
961   return Changed;
962 }
963 
964 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
965                                     SIAtomicScope Scope,
966                                     SIAtomicAddrSpace AddrSpace,
967                                     SIMemOp Op,
968                                     bool IsCrossAddrSpaceOrdering,
969                                     Position Pos) const {
970   bool Changed = false;
971 
972   MachineBasicBlock &MBB = *MI->getParent();
973   DebugLoc DL = MI->getDebugLoc();
974 
975   if (Pos == Position::AFTER)
976     ++MI;
977 
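  // vmcnt counts outstanding VMEM (global and scratch) accesses; lgkmcnt
  // counts LDS, GDS, scalar-memory and message operations.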
978   bool VMCnt = false;
979   bool LGKMCnt = false;
980 
981   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
982       SIAtomicAddrSpace::NONE) {
983     switch (Scope) {
984     case SIAtomicScope::SYSTEM:
985     case SIAtomicScope::AGENT:
986       VMCnt |= true;
987       break;
988     case SIAtomicScope::WORKGROUP:
989     case SIAtomicScope::WAVEFRONT:
990     case SIAtomicScope::SINGLETHREAD:
991       // The L1 cache keeps all memory operations in order for
992       // wavefronts in the same work-group.
993       break;
994     default:
995       llvm_unreachable("Unsupported synchronization scope");
996     }
997   }
998 
999   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1000     switch (Scope) {
1001     case SIAtomicScope::SYSTEM:
1002     case SIAtomicScope::AGENT:
1003     case SIAtomicScope::WORKGROUP:
1004       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1005       // not needed as LDS operations for all waves are executed in a total
1006       // global ordering as observed by all waves. Required if also
1007       // synchronizing with global/GDS memory as LDS operations could be
1008       // reordered with respect to later global/GDS memory operations of the
1009       // same wave.
1010       LGKMCnt |= IsCrossAddrSpaceOrdering;
1011       break;
1012     case SIAtomicScope::WAVEFRONT:
1013     case SIAtomicScope::SINGLETHREAD:
1014       // The LDS keeps all memory operations in order for
1015       // the same wavefront.
1016       break;
1017     default:
1018       llvm_unreachable("Unsupported synchronization scope");
1019     }
1020   }
1021 
1022   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1023     switch (Scope) {
1024     case SIAtomicScope::SYSTEM:
1025     case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1027       // is not needed as GDS operations for all waves are executed in a total
1028       // global ordering as observed by all waves. Required if also
1029       // synchronizing with global/LDS memory as GDS operations could be
1030       // reordered with respect to later global/LDS memory operations of the
1031       // same wave.
1032       LGKMCnt |= IsCrossAddrSpaceOrdering;
1033       break;
1034     case SIAtomicScope::WORKGROUP:
1035     case SIAtomicScope::WAVEFRONT:
1036     case SIAtomicScope::SINGLETHREAD:
1037       // The GDS keeps all memory operations in order for
1038       // the same work-group.
1039       break;
1040     default:
1041       llvm_unreachable("Unsupported synchronization scope");
1042     }
1043   }
1044 
1045   if (VMCnt || LGKMCnt) {
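    // Counters that do not need a wait are encoded with their maximum value,
    // which the hardware treats as "do not wait".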
1046     unsigned WaitCntImmediate =
1047       AMDGPU::encodeWaitcnt(IV,
1048                             VMCnt ? 0 : getVmcntBitMask(IV),
1049                             getExpcntBitMask(IV),
1050                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1051     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1052     Changed = true;
1053   }
1054 
1055   if (Pos == Position::AFTER)
1056     --MI;
1057 
1058   return Changed;
1059 }
1060 
1061 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1062                                        SIAtomicScope Scope,
1063                                        SIAtomicAddrSpace AddrSpace,
1064                                        Position Pos) const {
1065   if (!InsertCacheInv)
1066     return false;
1067 
1068   bool Changed = false;
1069 
1070   MachineBasicBlock &MBB = *MI->getParent();
1071   DebugLoc DL = MI->getDebugLoc();
1072 
1073   if (Pos == Position::AFTER)
1074     ++MI;
1075 
1076   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1077     switch (Scope) {
1078     case SIAtomicScope::SYSTEM:
1079     case SIAtomicScope::AGENT:
1080       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1081       Changed = true;
1082       break;
1083     case SIAtomicScope::WORKGROUP:
1084     case SIAtomicScope::WAVEFRONT:
1085     case SIAtomicScope::SINGLETHREAD:
1086       // No cache to invalidate.
1087       break;
1088     default:
1089       llvm_unreachable("Unsupported synchronization scope");
1090     }
1091   }
1092 
1093   /// The scratch address space does not need the global memory cache
1094   /// to be flushed as all memory operations by the same thread are
1095   /// sequentially consistent, and no other thread can access scratch
1096   /// memory.
1097 
1098   /// Other address spaces do not have a cache.
1099 
1100   if (Pos == Position::AFTER)
1101     --MI;
1102 
1103   return Changed;
1104 }
1105 
1106 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1107                                        SIAtomicScope Scope,
1108                                        SIAtomicAddrSpace AddrSpace,
1109                                        bool IsCrossAddrSpaceOrdering,
1110                                        Position Pos) const {
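  // There is no cache writeback to perform here: the L1 is write-through on
  // these targets, so a release only requires waiting for earlier accesses to
  // complete.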
1111   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1112                     IsCrossAddrSpaceOrdering, Pos);
1113 }
1114 
1115 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1116                                        SIAtomicScope Scope,
1117                                        SIAtomicAddrSpace AddrSpace,
1118                                        Position Pos) const {
1119   if (!InsertCacheInv)
1120     return false;
1121 
1122   bool Changed = false;
1123 
1124   MachineBasicBlock &MBB = *MI->getParent();
1125   DebugLoc DL = MI->getDebugLoc();
1126 
1127   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1128 
1129   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1130                                     ? AMDGPU::BUFFER_WBINVL1
1131                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1132 
1133   if (Pos == Position::AFTER)
1134     ++MI;
1135 
1136   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1137     switch (Scope) {
1138     case SIAtomicScope::SYSTEM:
1139     case SIAtomicScope::AGENT:
1140       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1141       Changed = true;
1142       break;
1143     case SIAtomicScope::WORKGROUP:
1144     case SIAtomicScope::WAVEFRONT:
1145     case SIAtomicScope::SINGLETHREAD:
1146       // No cache to invalidate.
1147       break;
1148     default:
1149       llvm_unreachable("Unsupported synchronization scope");
1150     }
1151   }
1152 
1153   /// The scratch address space does not need the global memory cache
1154   /// to be flushed as all memory operations by the same thread are
1155   /// sequentially consistent, and no other thread can access scratch
1156   /// memory.
1157 
1158   /// Other address spaces do not have a cache.
1159 
1160   if (Pos == Position::AFTER)
1161     --MI;
1162 
1163   return Changed;
1164 }
1165 
1166 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1167     const MachineBasicBlock::iterator &MI,
1168     SIAtomicScope Scope,
1169     SIAtomicAddrSpace AddrSpace) const {
1170   assert(MI->mayLoad() && !MI->mayStore());
1171   bool Changed = false;
1172 
1173   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174     switch (Scope) {
1175     case SIAtomicScope::SYSTEM:
1176     case SIAtomicScope::AGENT:
1177       // Set the L1 cache policy to MISS_LRU.
1178       // Note: there is no L2 cache bypass policy at the ISA level.
1179       Changed |= enableGLCBit(MI);
1180       break;
1181     case SIAtomicScope::WORKGROUP:
1182       // In threadgroup split mode the waves of a work-group can be executing on
1183       // different CUs. Therefore need to bypass the L1 which is per CU.
1184       // Otherwise in non-threadgroup split mode all waves of a work-group are
1185       // on the same CU, and so the L1 does not need to be bypassed.
1186       if (ST.isTgSplitEnabled())
1187         Changed |= enableGLCBit(MI);
1188       break;
1189     case SIAtomicScope::WAVEFRONT:
1190     case SIAtomicScope::SINGLETHREAD:
1191       // No cache to bypass.
1192       break;
1193     default:
1194       llvm_unreachable("Unsupported synchronization scope");
1195     }
1196   }
1197 
1198   /// The scratch address space does not need the global memory caches
1199   /// to be bypassed as all memory operations by the same thread are
1200   /// sequentially consistent, and no other thread can access scratch
1201   /// memory.
1202 
1203   /// Other address spaces do not have a cache.
1204 
1205   return Changed;
1206 }
1207 
1208 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1209     const MachineBasicBlock::iterator &MI,
1210     SIAtomicScope Scope,
1211     SIAtomicAddrSpace AddrSpace) const {
1212   assert(!MI->mayLoad() && MI->mayStore());
1213   bool Changed = false;
1214 
1215   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1216     switch (Scope) {
1217     case SIAtomicScope::SYSTEM:
1218     case SIAtomicScope::AGENT:
1219       /// Do not set glc for store atomic operations as they implicitly write
1220       /// through the L1 cache.
1221       break;
1222     case SIAtomicScope::WORKGROUP:
1223     case SIAtomicScope::WAVEFRONT:
1224     case SIAtomicScope::SINGLETHREAD:
1225       // No cache to bypass. Store atomics implicitly write through the L1
1226       // cache.
1227       break;
1228     default:
1229       llvm_unreachable("Unsupported synchronization scope");
1230     }
1231   }
1232 
1233   /// The scratch address space does not need the global memory caches
1234   /// to be bypassed as all memory operations by the same thread are
1235   /// sequentially consistent, and no other thread can access scratch
1236   /// memory.
1237 
1238   /// Other address spaces do not have a cache.
1239 
1240   return Changed;
1241 }
1242 
1243 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1244     const MachineBasicBlock::iterator &MI,
1245     SIAtomicScope Scope,
1246     SIAtomicAddrSpace AddrSpace) const {
1247   assert(MI->mayLoad() && MI->mayStore());
1248   bool Changed = false;
1249 
1250   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1251     switch (Scope) {
1252     case SIAtomicScope::SYSTEM:
1253     case SIAtomicScope::AGENT:
1254       /// Do not set glc for RMW atomic operations as they implicitly bypass
1255       /// the L1 cache, and the glc bit is instead used to indicate if they are
1256       /// return or no-return.
1257       break;
1258     case SIAtomicScope::WORKGROUP:
1259     case SIAtomicScope::WAVEFRONT:
1260     case SIAtomicScope::SINGLETHREAD:
1261       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1262       break;
1263     default:
1264       llvm_unreachable("Unsupported synchronization scope");
1265     }
1266   }
1267 
1268   return Changed;
1269 }
1270 
1271 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1272     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1273     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled here sensibly without pessimizing all atomics; they also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1284 
1285   bool Changed = false;
1286 
1287   if (IsVolatile) {
1288     // Set L1 cache policy to be MISS_EVICT for load instructions
1289     // and MISS_LRU for store instructions.
1290     // Note: there is no L2 cache bypass policy at the ISA level.
1291     if (Op == SIMemOp::LOAD)
1292       Changed |= enableGLCBit(MI);
1293 
1294     // Ensure operation has completed at system scope to cause all volatile
1295     // operations to be visible outside the program in a global order. Do not
1296     // request cross address space as only the global address space can be
1297     // observable outside the program, so no need to cause a waitcnt for LDS
1298     // address space operations.
1299     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1300                           Position::AFTER);
1301 
1302     return Changed;
1303   }
1304 
1305   if (IsNonTemporal) {
1306     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1307     // for both loads and stores, and the L2 cache policy to STREAM.
1308     Changed |= enableGLCBit(MI);
1309     Changed |= enableSLCBit(MI);
1310     return Changed;
1311   }
1312 
1313   return Changed;
1314 }
1315 
1316 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1317                                       SIAtomicScope Scope,
1318                                       SIAtomicAddrSpace AddrSpace,
1319                                       SIMemOp Op,
1320                                       bool IsCrossAddrSpaceOrdering,
1321                                       Position Pos) const {
1322   if (ST.isTgSplitEnabled()) {
1323     // In threadgroup split mode the waves of a work-group can be executing on
1324     // different CUs. Therefore need to wait for global or GDS memory operations
1325     // to complete to ensure they are visible to waves in the other CUs.
1326     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1327     // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
1329     // on a CU.
1330     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1331                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1332         (Scope == SIAtomicScope::WORKGROUP)) {
1333       // Same as GFX7 using agent scope.
1334       Scope = SIAtomicScope::AGENT;
1335     }
1336     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1337     // LDS memory operations.
1338     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1339   }
1340   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1341                                         IsCrossAddrSpaceOrdering, Pos);
1342 }
1343 
1344 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1345                                          SIAtomicScope Scope,
1346                                          SIAtomicAddrSpace AddrSpace,
1347                                          Position Pos) const {
1348   if (!InsertCacheInv)
1349     return false;
1350 
1351   bool Changed = false;
1352 
1353   MachineBasicBlock &MBB = *MI->getParent();
1354   DebugLoc DL = MI->getDebugLoc();
1355 
1356   if (Pos == Position::AFTER)
1357     ++MI;
1358 
1359   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1360     switch (Scope) {
1361     case SIAtomicScope::SYSTEM:
1362       // Ensures that following loads will not see stale remote VMEM data or
1363       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1364       // CC will never be stale due to the local memory probes.
1365       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
1367       // hardware does not reorder memory operations by the same wave with
1368       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1369       // remove any cache lines of earlier writes by the same wave and ensures
1370       // later reads by the same wave will refetch the cache lines.
1371       Changed = true;
1372       break;
1373     case SIAtomicScope::AGENT:
1374       // Same as GFX7.
1375       break;
1376     case SIAtomicScope::WORKGROUP:
1377       // In threadgroup split mode the waves of a work-group can be executing on
1378       // different CUs. Therefore need to invalidate the L1 which is per CU.
1379       // Otherwise in non-threadgroup split mode all waves of a work-group are
1380       // on the same CU, and so the L1 does not need to be invalidated.
1381       if (ST.isTgSplitEnabled()) {
1382         // Same as GFX7 using agent scope.
1383         Scope = SIAtomicScope::AGENT;
1384       }
1385       break;
1386     case SIAtomicScope::WAVEFRONT:
1387     case SIAtomicScope::SINGLETHREAD:
1388       // Same as GFX7.
1389       break;
1390     default:
1391       llvm_unreachable("Unsupported synchronization scope");
1392     }
1393   }
1394 
1395   /// The scratch address space does not need the global memory cache
1396   /// to be flushed as all memory operations by the same thread are
1397   /// sequentially consistent, and no other thread can access scratch
1398   /// memory.
1399 
1400   /// Other address spaces do not have a cache.
1401 
1402   if (Pos == Position::AFTER)
1403     --MI;
1404 
1405   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1406 
1407   return Changed;
1408 }
1409 
1410 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1411                                          SIAtomicScope Scope,
1412                                          SIAtomicAddrSpace AddrSpace,
1413                                          bool IsCrossAddrSpaceOrdering,
1414                                          Position Pos) const {
1415   bool Changed = false;
1416 
1417   MachineBasicBlock &MBB = *MI->getParent();
1418   DebugLoc DL = MI->getDebugLoc();
1419 
1420   if (Pos == Position::AFTER)
1421     ++MI;
1422 
1423   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1424     switch (Scope) {
1425     case SIAtomicScope::SYSTEM:
1426       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1427       // hardware does not reorder memory operations by the same wave with
1428       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1429       // to initiate writeback of any dirty cache lines of earlier writes by the
1430       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1431       // writeback has completed.
1432       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1433         // Set SC bits to indicate system scope.
1434         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // This is followed by the same handling as GFX7, which will insert the
      // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1437       Changed = true;
1438       break;
1439     case SIAtomicScope::AGENT:
1440     case SIAtomicScope::WORKGROUP:
1441     case SIAtomicScope::WAVEFRONT:
1442     case SIAtomicScope::SINGLETHREAD:
1443       // Same as GFX7.
1444       break;
1445     default:
1446       llvm_unreachable("Unsupported synchronization scope");
1447     }
1448   }
1449 
1450   if (Pos == Position::AFTER)
1451     --MI;
1452 
1453   Changed |=
1454       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1455                                         IsCrossAddrSpaceOrdering, Pos);
1456 
1457   return Changed;
1458 }
1459 
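// A brief summary of how the GFX940 handling below encodes scope in the
// SC0/SC1 cache-policy bits (this only restates the switch statements that
// follow, not an authoritative ISA description):
//   - neither SC0 nor SC1 set : wavefront / single-thread scope
//   - SC0 only                : work-group scope
//   - SC1 only                : agent scope
//   - SC0 and SC1             : system scope
// For read-modify-write atomics SC0 instead encodes whether the atomic
// returns a result, so only SC1 is used for scope (see enableRMWCacheBypass).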
1460 bool SIGfx940CacheControl::enableLoadCacheBypass(
1461     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1462     SIAtomicAddrSpace AddrSpace) const {
1463   assert(MI->mayLoad() && !MI->mayStore());
1464   bool Changed = false;
1465 
1466   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1467     switch (Scope) {
1468     case SIAtomicScope::SYSTEM:
1469       // Set SC bits to indicate system scope.
1470       Changed |= enableSC0Bit(MI);
1471       Changed |= enableSC1Bit(MI);
1472       break;
1473     case SIAtomicScope::AGENT:
1474       // Set SC bits to indicate agent scope.
1475       Changed |= enableSC1Bit(MI);
1476       break;
1477     case SIAtomicScope::WORKGROUP:
1478       // In threadgroup split mode the waves of a work-group can be executing on
1479       // different CUs. Therefore need to bypass the L1 which is per CU.
1480       // Otherwise in non-threadgroup split mode all waves of a work-group are
1481       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1482       // bits to indicate work-group scope will do this automatically.
1483       Changed |= enableSC0Bit(MI);
1484       break;
1485     case SIAtomicScope::WAVEFRONT:
1486     case SIAtomicScope::SINGLETHREAD:
1487       // Leave SC bits unset to indicate wavefront scope.
1488       break;
1489     default:
1490       llvm_unreachable("Unsupported synchronization scope");
1491     }
1492   }
1493 
1494   /// The scratch address space does not need the global memory caches
1495   /// to be bypassed as all memory operations by the same thread are
1496   /// sequentially consistent, and no other thread can access scratch
1497   /// memory.
1498 
1499   /// Other address spaces do not have a cache.
1500 
1501   return Changed;
1502 }
1503 
1504 bool SIGfx940CacheControl::enableStoreCacheBypass(
1505     const MachineBasicBlock::iterator &MI,
1506     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1507   assert(!MI->mayLoad() && MI->mayStore());
1508   bool Changed = false;
1509 
1510   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1511     switch (Scope) {
1512     case SIAtomicScope::SYSTEM:
1513       // Set SC bits to indicate system scope.
1514       Changed |= enableSC0Bit(MI);
1515       Changed |= enableSC1Bit(MI);
1516       break;
1517     case SIAtomicScope::AGENT:
1518       // Set SC bits to indicate agent scope.
1519       Changed |= enableSC1Bit(MI);
1520       break;
1521     case SIAtomicScope::WORKGROUP:
1522       // Set SC bits to indicate workgroup scope.
1523       Changed |= enableSC0Bit(MI);
1524       break;
1525     case SIAtomicScope::WAVEFRONT:
1526     case SIAtomicScope::SINGLETHREAD:
1527       // Leave SC bits unset to indicate wavefront scope.
1528       break;
1529     default:
1530       llvm_unreachable("Unsupported synchronization scope");
1531     }
1532   }
1533 
1534   /// The scratch address space does not need the global memory caches
1535   /// to be bypassed as all memory operations by the same thread are
1536   /// sequentially consistent, and no other thread can access scratch
1537   /// memory.
1538 
1539   /// Other address spaces do not have a cache.
1540 
1541   return Changed;
1542 }
1543 
1544 bool SIGfx940CacheControl::enableRMWCacheBypass(
1545     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1546     SIAtomicAddrSpace AddrSpace) const {
1547   assert(MI->mayLoad() && MI->mayStore());
1548   bool Changed = false;
1549 
1550   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1551     switch (Scope) {
1552     case SIAtomicScope::SYSTEM:
1553       // Set SC1 bit to indicate system scope.
1554       Changed |= enableSC1Bit(MI);
1555       break;
1556     case SIAtomicScope::AGENT:
1557     case SIAtomicScope::WORKGROUP:
1558     case SIAtomicScope::WAVEFRONT:
1559     case SIAtomicScope::SINGLETHREAD:
1560       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1561       // to indicate system or agent scope. The SC0 bit is used to indicate if
1562       // they are return or no-return. Leave SC1 bit unset to indicate agent
1563       // scope.
1564       break;
1565     default:
1566       llvm_unreachable("Unsupported synchronization scope");
1567     }
1568   }
1569 
1570   return Changed;
1571 }
1572 
1573 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1574     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1575     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so it must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1586 
1587   bool Changed = false;
1588 
1589   if (IsVolatile) {
1590     // Set SC bits to indicate system scope.
1591     Changed |= enableSC0Bit(MI);
1592     Changed |= enableSC1Bit(MI);
1593 
1594     // Ensure operation has completed at system scope to cause all volatile
1595     // operations to be visible outside the program in a global order. Do not
1596     // request cross address space as only the global address space can be
1597     // observable outside the program, so no need to cause a waitcnt for LDS
1598     // address space operations.
1599     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1600                           Position::AFTER);
1601 
1602     return Changed;
1603   }
1604 
1605   if (IsNonTemporal) {
1606     Changed |= enableNTBit(MI);
1607     return Changed;
1608   }
1609 
1610   return Changed;
1611 }
1612 
1613 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1614                                          SIAtomicScope Scope,
1615                                          SIAtomicAddrSpace AddrSpace,
1616                                          Position Pos) const {
1617   if (!InsertCacheInv)
1618     return false;
1619 
1620   bool Changed = false;
1621 
1622   MachineBasicBlock &MBB = *MI->getParent();
1623   DebugLoc DL = MI->getDebugLoc();
1624 
1625   if (Pos == Position::AFTER)
1626     ++MI;
1627 
1628   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1629     switch (Scope) {
1630     case SIAtomicScope::SYSTEM:
1631       // Ensures that following loads will not see stale remote VMEM data or
1632       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1633       // CC will never be stale due to the local memory probes.
1634       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1635           // Set SC bits to indicate system scope.
1636           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1637       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1638       // hardware does not reorder memory operations by the same wave with
1639       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1640       // remove any cache lines of earlier writes by the same wave and ensures
1641       // later reads by the same wave will refetch the cache lines.
1642       Changed = true;
1643       break;
1644     case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
1655       Changed = true;
1656       break;
1657     case SIAtomicScope::WORKGROUP:
1658       // In threadgroup split mode the waves of a work-group can be executing on
1659       // different CUs. Therefore need to invalidate the L1 which is per CU.
1660       // Otherwise in non-threadgroup split mode all waves of a work-group are
1661       // on the same CU, and so the L1 does not need to be invalidated.
1662       if (ST.isTgSplitEnabled()) {
        // Ensures the L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it when we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding buffer
        // invalidate. The invalidate is guaranteed to remove any cache lines of
        // earlier writes and ensures later reads will refetch the cache lines.
1673         Changed = true;
1674       }
1675       break;
1676     case SIAtomicScope::WAVEFRONT:
1677     case SIAtomicScope::SINGLETHREAD:
1678       // Could generate "BUFFER_INV" but it would do nothing as there are no
1679       // caches to invalidate.
1680       break;
1681     default:
1682       llvm_unreachable("Unsupported synchronization scope");
1683     }
1684   }
1685 
1686   /// The scratch address space does not need the global memory cache
1687   /// to be flushed as all memory operations by the same thread are
1688   /// sequentially consistent, and no other thread can access scratch
1689   /// memory.
1690 
1691   /// Other address spaces do not have a cache.
1692 
1693   if (Pos == Position::AFTER)
1694     --MI;
1695 
1696   return Changed;
1697 }
1698 
1699 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1700                                          SIAtomicScope Scope,
1701                                          SIAtomicAddrSpace AddrSpace,
1702                                          bool IsCrossAddrSpaceOrdering,
1703                                          Position Pos) const {
1704   bool Changed = false;
1705 
1706   MachineBasicBlock &MBB = *MI->getParent();
1707   DebugLoc DL = MI->getDebugLoc();
1708 
1709   if (Pos == Position::AFTER)
1710     ++MI;
1711 
1712   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1713     switch (Scope) {
1714     case SIAtomicScope::SYSTEM:
1715       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1716       // hardware does not reorder memory operations by the same wave with
1717       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1718       // to initiate writeback of any dirty cache lines of earlier writes by the
1719       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1720       // writeback has completed.
1721       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1722           // Set SC bits to indicate system scope.
1723           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1724       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1725       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1726       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1727       Changed = true;
1728       break;
1729     case SIAtomicScope::AGENT:
1730       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1731           // Set SC bits to indicate agent scope.
1732           .addImm(AMDGPU::CPol::SC1);
1733 
1734       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1735       // SIAtomicScope::AGENT, the following insertWait will generate the
1736       // required "S_WAITCNT vmcnt(0)".
1737       Changed = true;
1738       break;
1739     case SIAtomicScope::WORKGROUP:
1740     case SIAtomicScope::WAVEFRONT:
1741     case SIAtomicScope::SINGLETHREAD:
1742       // Do not generate "BUFFER_WBL2" as there are no caches it would
1743       // writeback, and would require an otherwise unnecessary
1744       // "S_WAITCNT vmcnt(0)".
1745       break;
1746     default:
1747       llvm_unreachable("Unsupported synchronization scope");
1748     }
1749   }
1750 
1751   if (Pos == Position::AFTER)
1752     --MI;
1753 
  // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" emitted
  // above, as well as any other waits this release requires.
1756   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1757                         IsCrossAddrSpaceOrdering, Pos);
1758 
1759   return Changed;
1760 }
1761 
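// For the GFX10 handling below: the GLC bit controls the per-CU L0 cache
// policy and the DLC bit controls the L1 cache policy; there is no ISA-level
// control for coherently bypassing the L2 (see the per-scope comments in the
// individual cases).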
1762 bool SIGfx10CacheControl::enableLoadCacheBypass(
1763     const MachineBasicBlock::iterator &MI,
1764     SIAtomicScope Scope,
1765     SIAtomicAddrSpace AddrSpace) const {
1766   assert(MI->mayLoad() && !MI->mayStore());
1767   bool Changed = false;
1768 
1769   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1770     switch (Scope) {
1771     case SIAtomicScope::SYSTEM:
1772     case SIAtomicScope::AGENT:
1773       // Set the L0 and L1 cache policies to MISS_EVICT.
1774       // Note: there is no L2 cache coherent bypass control at the ISA level.
1775       Changed |= enableGLCBit(MI);
1776       Changed |= enableDLCBit(MI);
1777       break;
1778     case SIAtomicScope::WORKGROUP:
1779       // In WGP mode the waves of a work-group can be executing on either CU of
1780       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1781       // CU mode all waves of a work-group are on the same CU, and so the L0
1782       // does not need to be bypassed.
1783       if (!ST.isCuModeEnabled())
1784         Changed |= enableGLCBit(MI);
1785       break;
1786     case SIAtomicScope::WAVEFRONT:
1787     case SIAtomicScope::SINGLETHREAD:
1788       // No cache to bypass.
1789       break;
1790     default:
1791       llvm_unreachable("Unsupported synchronization scope");
1792     }
1793   }
1794 
1795   /// The scratch address space does not need the global memory caches
1796   /// to be bypassed as all memory operations by the same thread are
1797   /// sequentially consistent, and no other thread can access scratch
1798   /// memory.
1799 
1800   /// Other address spaces do not have a cache.
1801 
1802   return Changed;
1803 }
1804 
1805 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1806     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1807     bool IsVolatile, bool IsNonTemporal) const {
1808 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so it must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1819 
1820   bool Changed = false;
1821 
1822   if (IsVolatile) {
1823     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1824     // and MISS_LRU for store instructions.
1825     // Note: there is no L2 cache coherent bypass control at the ISA level.
1826     if (Op == SIMemOp::LOAD) {
1827       Changed |= enableGLCBit(MI);
1828       Changed |= enableDLCBit(MI);
1829     }
1830 
1831     // Ensure operation has completed at system scope to cause all volatile
1832     // operations to be visible outside the program in a global order. Do not
1833     // request cross address space as only the global address space can be
1834     // observable outside the program, so no need to cause a waitcnt for LDS
1835     // address space operations.
1836     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1837                           Position::AFTER);
1838     return Changed;
1839   }
1840 
1841   if (IsNonTemporal) {
1842     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1843     // and L2 cache policy to STREAM.
1844     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1845     // to MISS_EVICT and the L2 cache policy to STREAM.
1846     if (Op == SIMemOp::STORE)
1847       Changed |= enableGLCBit(MI);
1848     Changed |= enableSLCBit(MI);
1849 
1850     return Changed;
1851   }
1852 
1853   return Changed;
1854 }
1855 
1856 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1857                                      SIAtomicScope Scope,
1858                                      SIAtomicAddrSpace AddrSpace,
1859                                      SIMemOp Op,
1860                                      bool IsCrossAddrSpaceOrdering,
1861                                      Position Pos) const {
1862   bool Changed = false;
1863 
1864   MachineBasicBlock &MBB = *MI->getParent();
1865   DebugLoc DL = MI->getDebugLoc();
1866 
1867   if (Pos == Position::AFTER)
1868     ++MI;
1869 
1870   bool VMCnt = false;
1871   bool VSCnt = false;
1872   bool LGKMCnt = false;
1873 
1874   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1875       SIAtomicAddrSpace::NONE) {
1876     switch (Scope) {
1877     case SIAtomicScope::SYSTEM:
1878     case SIAtomicScope::AGENT:
1879       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1880         VMCnt |= true;
1881       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1882         VSCnt |= true;
1883       break;
1884     case SIAtomicScope::WORKGROUP:
1885       // In WGP mode the waves of a work-group can be executing on either CU of
1886       // the WGP. Therefore need to wait for operations to complete to ensure
1887       // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // which shares the same L0.
1890       if (!ST.isCuModeEnabled()) {
1891         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1892           VMCnt |= true;
1893         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1894           VSCnt |= true;
1895       }
1896       break;
1897     case SIAtomicScope::WAVEFRONT:
1898     case SIAtomicScope::SINGLETHREAD:
1899       // The L0 cache keeps all memory operations in order for
1900       // work-items in the same wavefront.
1901       break;
1902     default:
1903       llvm_unreachable("Unsupported synchronization scope");
1904     }
1905   }
1906 
1907   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1908     switch (Scope) {
1909     case SIAtomicScope::SYSTEM:
1910     case SIAtomicScope::AGENT:
1911     case SIAtomicScope::WORKGROUP:
1912       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1913       // not needed as LDS operations for all waves are executed in a total
1914       // global ordering as observed by all waves. Required if also
1915       // synchronizing with global/GDS memory as LDS operations could be
1916       // reordered with respect to later global/GDS memory operations of the
1917       // same wave.
1918       LGKMCnt |= IsCrossAddrSpaceOrdering;
1919       break;
1920     case SIAtomicScope::WAVEFRONT:
1921     case SIAtomicScope::SINGLETHREAD:
1922       // The LDS keeps all memory operations in order for
1923       // the same wavefront.
1924       break;
1925     default:
1926       llvm_unreachable("Unsupported synchronization scope");
1927     }
1928   }
1929 
1930   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1931     switch (Scope) {
1932     case SIAtomicScope::SYSTEM:
1933     case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1935       // is not needed as GDS operations for all waves are executed in a total
1936       // global ordering as observed by all waves. Required if also
1937       // synchronizing with global/LDS memory as GDS operations could be
1938       // reordered with respect to later global/LDS memory operations of the
1939       // same wave.
1940       LGKMCnt |= IsCrossAddrSpaceOrdering;
1941       break;
1942     case SIAtomicScope::WORKGROUP:
1943     case SIAtomicScope::WAVEFRONT:
1944     case SIAtomicScope::SINGLETHREAD:
1945       // The GDS keeps all memory operations in order for
1946       // the same work-group.
1947       break;
1948     default:
1949       llvm_unreachable("Unsupported synchronization scope");
1950     }
1951   }
1952 
1953   if (VMCnt || LGKMCnt) {
1954     unsigned WaitCntImmediate =
1955       AMDGPU::encodeWaitcnt(IV,
1956                             VMCnt ? 0 : getVmcntBitMask(IV),
1957                             getExpcntBitMask(IV),
1958                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1959     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1960     Changed = true;
1961   }
1962 
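  // On GFX10+ VMEM stores are tracked by a separate vscnt counter, so waiting
  // for stores requires a distinct S_WAITCNT_VSCNT instruction. For example,
  // an agent-scope release ordering both loads and stores would typically
  // produce both "s_waitcnt vmcnt(0)" and "s_waitcnt_vscnt null, 0x0".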
1963   if (VSCnt) {
1964     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1965       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1966       .addImm(0);
1967     Changed = true;
1968   }
1969 
1970   if (Pos == Position::AFTER)
1971     --MI;
1972 
1973   return Changed;
1974 }
1975 
1976 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1977                                         SIAtomicScope Scope,
1978                                         SIAtomicAddrSpace AddrSpace,
1979                                         Position Pos) const {
1980   if (!InsertCacheInv)
1981     return false;
1982 
1983   bool Changed = false;
1984 
1985   MachineBasicBlock &MBB = *MI->getParent();
1986   DebugLoc DL = MI->getDebugLoc();
1987 
1988   if (Pos == Position::AFTER)
1989     ++MI;
1990 
1991   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1992     switch (Scope) {
1993     case SIAtomicScope::SYSTEM:
1994     case SIAtomicScope::AGENT:
1995       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1996       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1997       Changed = true;
1998       break;
1999     case SIAtomicScope::WORKGROUP:
2000       // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
2004       if (!ST.isCuModeEnabled()) {
2005         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2006         Changed = true;
2007       }
2008       break;
2009     case SIAtomicScope::WAVEFRONT:
2010     case SIAtomicScope::SINGLETHREAD:
2011       // No cache to invalidate.
2012       break;
2013     default:
2014       llvm_unreachable("Unsupported synchronization scope");
2015     }
2016   }
2017 
2018   /// The scratch address space does not need the global memory cache
2019   /// to be flushed as all memory operations by the same thread are
2020   /// sequentially consistent, and no other thread can access scratch
2021   /// memory.
2022 
2023   /// Other address spaces do not have a cache.
2024 
2025   if (Pos == Position::AFTER)
2026     --MI;
2027 
2028   return Changed;
2029 }
2030 
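// GFX11 differs from GFX10 in that the DLC bit no longer participates in the
// scope-based L1 bypass below; instead it selects MALL NOALLOC and is only set
// for volatile and nontemporal accesses (see enableVolatileAndOrNonTemporal).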
2031 bool SIGfx11CacheControl::enableLoadCacheBypass(
2032     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2033     SIAtomicAddrSpace AddrSpace) const {
2034   assert(MI->mayLoad() && !MI->mayStore());
2035   bool Changed = false;
2036 
2037   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2038     switch (Scope) {
2039     case SIAtomicScope::SYSTEM:
2040     case SIAtomicScope::AGENT:
2041       // Set the L0 and L1 cache policies to MISS_EVICT.
2042       // Note: there is no L2 cache coherent bypass control at the ISA level.
2043       Changed |= enableGLCBit(MI);
2044       break;
2045     case SIAtomicScope::WORKGROUP:
2046       // In WGP mode the waves of a work-group can be executing on either CU of
2047       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2048       // CU mode all waves of a work-group are on the same CU, and so the L0
2049       // does not need to be bypassed.
2050       if (!ST.isCuModeEnabled())
2051         Changed |= enableGLCBit(MI);
2052       break;
2053     case SIAtomicScope::WAVEFRONT:
2054     case SIAtomicScope::SINGLETHREAD:
2055       // No cache to bypass.
2056       break;
2057     default:
2058       llvm_unreachable("Unsupported synchronization scope");
2059     }
2060   }
2061 
2062   /// The scratch address space does not need the global memory caches
2063   /// to be bypassed as all memory operations by the same thread are
2064   /// sequentially consistent, and no other thread can access scratch
2065   /// memory.
2066 
2067   /// Other address spaces do not have a cache.
2068 
2069   return Changed;
2070 }
2071 
2072 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2073     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2074     bool IsVolatile, bool IsNonTemporal) const {
2075 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so it must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2086 
2087   bool Changed = false;
2088 
2089   if (IsVolatile) {
2090     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2091     // and MISS_LRU for store instructions.
2092     // Note: there is no L2 cache coherent bypass control at the ISA level.
2093     if (Op == SIMemOp::LOAD)
2094       Changed |= enableGLCBit(MI);
2095 
2096     // Set MALL NOALLOC for load and store instructions.
2097     Changed |= enableDLCBit(MI);
2098 
2099     // Ensure operation has completed at system scope to cause all volatile
2100     // operations to be visible outside the program in a global order. Do not
2101     // request cross address space as only the global address space can be
2102     // observable outside the program, so no need to cause a waitcnt for LDS
2103     // address space operations.
2104     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2105                           Position::AFTER);
2106     return Changed;
2107   }
2108 
2109   if (IsNonTemporal) {
2110     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2111     // and L2 cache policy to STREAM.
2112     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2113     // to MISS_EVICT and the L2 cache policy to STREAM.
2114     if (Op == SIMemOp::STORE)
2115       Changed |= enableGLCBit(MI);
2116     Changed |= enableSLCBit(MI);
2117 
2118     // Set MALL NOALLOC for load and store instructions.
2119     Changed |= enableDLCBit(MI);
2120     return Changed;
2121   }
2122 
2123   return Changed;
2124 }
2125 
2126 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2127   if (AtomicPseudoMIs.empty())
2128     return false;
2129 
2130   for (auto &MI : AtomicPseudoMIs)
2131     MI->eraseFromParent();
2132 
2133   AtomicPseudoMIs.clear();
2134   return true;
2135 }
2136 
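// A rough sketch of how an atomic load is expanded below; the exact
// instructions come from the subtarget's SICacheControl implementation and
// may differ:
//   seq_cst only       : insertWait BEFORE (e.g. "s_waitcnt vmcnt(0) lgkmcnt(0)")
//   monotonic or wider : the load itself with cache-bypass bits for its scope
//   acquire or seq_cst : insertWait AFTER, then insertAcquire AFTER (e.g. a
//                        "buffer_gl0_inv" / "buffer_wbinvl1_vol" style
//                        invalidate, depending on the generation)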
2137 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2138                                    MachineBasicBlock::iterator &MI) {
2139   assert(MI->mayLoad() && !MI->mayStore());
2140 
2141   bool Changed = false;
2142 
2143   if (MOI.isAtomic()) {
2144     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2145         MOI.getOrdering() == AtomicOrdering::Acquire ||
2146         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2147       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2148                                            MOI.getOrderingAddrSpace());
2149     }
2150 
2151     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2152       Changed |= CC->insertWait(MI, MOI.getScope(),
2153                                 MOI.getOrderingAddrSpace(),
2154                                 SIMemOp::LOAD | SIMemOp::STORE,
2155                                 MOI.getIsCrossAddressSpaceOrdering(),
2156                                 Position::BEFORE);
2157 
2158     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2159         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2160       Changed |= CC->insertWait(MI, MOI.getScope(),
2161                                 MOI.getInstrAddrSpace(),
2162                                 SIMemOp::LOAD,
2163                                 MOI.getIsCrossAddressSpaceOrdering(),
2164                                 Position::AFTER);
2165       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2166                                    MOI.getOrderingAddrSpace(),
2167                                    Position::AFTER);
2168     }
2169 
2170     return Changed;
2171   }
2172 
2173   // Atomic instructions already bypass caches to the scope specified by the
2174   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2175   // need additional treatment.
2176   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2177                                                 SIMemOp::LOAD, MOI.isVolatile(),
2178                                                 MOI.isNonTemporal());
2179   return Changed;
2180 }
2181 
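// Atomic stores only need release handling: insertRelease BEFORE the store
// (a writeback and/or wait, depending on the generation) plus the store's own
// cache-bypass bits; no acquire-side invalidate is required afterwards.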
2182 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2183                                     MachineBasicBlock::iterator &MI) {
2184   assert(!MI->mayLoad() && MI->mayStore());
2185 
2186   bool Changed = false;
2187 
2188   if (MOI.isAtomic()) {
2189     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2190         MOI.getOrdering() == AtomicOrdering::Release ||
2191         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2192       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2193                                             MOI.getOrderingAddrSpace());
2194     }
2195 
2196     if (MOI.getOrdering() == AtomicOrdering::Release ||
2197         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2198       Changed |= CC->insertRelease(MI, MOI.getScope(),
2199                                    MOI.getOrderingAddrSpace(),
2200                                    MOI.getIsCrossAddressSpaceOrdering(),
2201                                    Position::BEFORE);
2202 
2203     return Changed;
2204   }
2205 
2206   // Atomic instructions already bypass caches to the scope specified by the
2207   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2208   // need additional treatment.
2209   Changed |= CC->enableVolatileAndOrNonTemporal(
2210       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2211       MOI.isNonTemporal());
2212   return Changed;
2213 }
2214 
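// A fence expands purely into release (insertRelease) and/or acquire
// (insertAcquire) actions at the position of the ATOMIC_FENCE pseudo; the
// pseudo itself is recorded in AtomicPseudoMIs and erased later by
// removeAtomicPseudoMIs().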
2215 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2216                                           MachineBasicBlock::iterator &MI) {
2217   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2218 
2219   AtomicPseudoMIs.push_back(MI);
2220   bool Changed = false;
2221 
2222   if (MOI.isAtomic()) {
2223     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2224         MOI.getOrdering() == AtomicOrdering::Release ||
2225         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2226         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barriers could be added in this
      /// file. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
2234       Changed |= CC->insertRelease(MI, MOI.getScope(),
2235                                    MOI.getOrderingAddrSpace(),
2236                                    MOI.getIsCrossAddressSpaceOrdering(),
2237                                    Position::BEFORE);
2238 
2239     // TODO: If both release and invalidate are happening they could be combined
2240     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2241     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2242     // track cache invalidate and write back instructions.
2243 
2244     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2245         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2246         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2247       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2248                                    MOI.getOrderingAddrSpace(),
2249                                    Position::BEFORE);
2250 
2251     return Changed;
2252   }
2253 
2254   return Changed;
2255 }
2256 
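// For cmpxchg the failure ordering is also considered below: a failed
// compare-exchange still performs a load, so an acquire (or stronger) failure
// ordering needs the same trailing wait and invalidate as a successful
// acquire.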
2257 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2258   MachineBasicBlock::iterator &MI) {
2259   assert(MI->mayLoad() && MI->mayStore());
2260 
2261   bool Changed = false;
2262 
2263   if (MOI.isAtomic()) {
2264     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265         MOI.getOrdering() == AtomicOrdering::Acquire ||
2266         MOI.getOrdering() == AtomicOrdering::Release ||
2267         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2268         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2269       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2270                                           MOI.getInstrAddrSpace());
2271     }
2272 
2273     if (MOI.getOrdering() == AtomicOrdering::Release ||
2274         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2275         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2276         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2277       Changed |= CC->insertRelease(MI, MOI.getScope(),
2278                                    MOI.getOrderingAddrSpace(),
2279                                    MOI.getIsCrossAddressSpaceOrdering(),
2280                                    Position::BEFORE);
2281 
2282     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2283         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2284         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2285         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2286         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2287       Changed |= CC->insertWait(MI, MOI.getScope(),
2288                                 MOI.getInstrAddrSpace(),
2289                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2290                                                    SIMemOp::STORE,
2291                                 MOI.getIsCrossAddressSpaceOrdering(),
2292                                 Position::AFTER);
2293       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2294                                    MOI.getOrderingAddrSpace(),
2295                                    Position::AFTER);
2296     }
2297 
2298     return Changed;
2299   }
2300 
2301   return Changed;
2302 }
2303 
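// Pass driver: walk every instruction, skip anything not marked maybeAtomic,
// and let SIMemOpAccess classify the rest as a load, store, fence, or atomic
// cmpxchg/rmw, dispatching to the matching expand* routine above.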
2304 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2305   bool Changed = false;
2306 
2307   SIMemOpAccess MOA(MF);
2308   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2309 
2310   for (auto &MBB : MF) {
2311     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2312 
2313       // Unbundle instructions after the post-RA scheduler.
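      // Each bundled instruction is detached from its predecessor and has its
      // internal-read operand flags cleared, and the bundle header itself is
      // erased, so that the legalizer can process the memory operations
      // individually.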
2314       if (MI->isBundle() && MI->mayLoadOrStore()) {
2315         MachineBasicBlock::instr_iterator II(MI->getIterator());
2316         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2317              I != E && I->isBundledWithPred(); ++I) {
2318           I->unbundleFromPred();
2319           for (MachineOperand &MO : I->operands())
2320             if (MO.isReg())
2321               MO.setIsInternalRead(false);
2322         }
2323 
2324         MI->eraseFromParent();
2325         MI = II->getIterator();
2326       }
2327 
2328       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2329         continue;
2330 
2331       if (const auto &MOI = MOA.getLoadInfo(MI))
2332         Changed |= expandLoad(MOI.value(), MI);
2333       else if (const auto &MOI = MOA.getStoreInfo(MI))
2334         Changed |= expandStore(MOI.value(), MI);
2335       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2336         Changed |= expandAtomicFence(MOI.value(), MI);
2337       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2338         Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI);
2339     }
2340   }
2341 
2342   Changed |= removeAtomicPseudoMIs();
2343   return Changed;
2344 }
2345 
2346 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2347 
2348 char SIMemoryLegalizer::ID = 0;
2349 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2350 
2351 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2352   return new SIMemoryLegalizer();
2353 }
2354