1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Memory legalizer - implements the memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
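/// For example, SIMemOp::LOAD | SIMemOp::STORE describes a query that covers
/// both loads and stores (see SIGfx6CacheControl::insertRelease below).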
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52   BEFORE,
53   AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58   NONE,
59   SINGLETHREAD,
60   WAVEFRONT,
61   WORKGROUP,
62   AGENT,
63   SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
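    // For example, an atomic that only accesses the global address space and
    // only orders the global address space requires no cross address space
    // ordering.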
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
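    // For example, an instruction that only accesses LDS can be ordered at
    // most at work-group scope, so any wider requested scope is clamped to
    // SIAtomicScope::WORKGROUP.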
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
  /// \returns The address spaces accessed by the machine instruction used to
  /// create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
  /// \returns True iff memory ordering of operations on different address
  /// spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
  /// \returns A bit set of the address spaces accessed by \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
232   std::optional<SIMemOpInfo>
233   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241   std::optional<SIMemOpInfo>
242   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "std::nullopt"
245   /// otherwise.
246   std::optional<SIMemOpInfo>
247   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249   /// \returns Atomic fence info if \p MI is an atomic fence operation,
250   /// "std::nullopt" otherwise.
251   std::optional<SIMemOpInfo>
252   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255   /// rmw operation, "std::nullopt" otherwise.
256   std::optional<SIMemOpInfo>
257   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263   /// AMDGPU subtarget info.
264   const GCNSubtarget &ST;
265 
266   /// Instruction info.
267   const SIInstrInfo *TII = nullptr;
268 
269   IsaVersion IV;
270 
271   /// Whether to insert cache invalidating instructions.
272   bool InsertCacheInv;
273 
274   SICacheControl(const GCNSubtarget &ST);
275 
  /// Sets cache policy bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
278   bool enableNamedBit(const MachineBasicBlock::iterator MI,
279                       AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283   /// Create a cache control for the subtarget \p ST.
284   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286   /// Update \p MI memory load instruction to bypass any caches up to
287   /// the \p Scope memory scope for address spaces \p
288   /// AddrSpace. Return true iff the instruction was modified.
289   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290                                      SIAtomicScope Scope,
291                                      SIAtomicAddrSpace AddrSpace) const = 0;
292 
293   /// Update \p MI memory store instruction to bypass any caches up to
294   /// the \p Scope memory scope for address spaces \p
295   /// AddrSpace. Return true iff the instruction was modified.
296   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297                                       SIAtomicScope Scope,
298                                       SIAtomicAddrSpace AddrSpace) const = 0;
299 
300   /// Update \p MI memory read-modify-write instruction to bypass any caches up
301   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302   /// iff the instruction was modified.
303   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304                                     SIAtomicScope Scope,
305                                     SIAtomicAddrSpace AddrSpace) const = 0;
306 
307   /// Update \p MI memory instruction of kind \p Op associated with address
308   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309   /// true iff the instruction was modified.
310   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311                                               SIAtomicAddrSpace AddrSpace,
312                                               SIMemOp Op, bool IsVolatile,
313                                               bool IsNonTemporal) const = 0;
314 
315   /// Inserts any necessary instructions at position \p Pos relative
316   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317   /// \p Op associated with address spaces \p AddrSpace have completed. Used
318   /// between memory instructions to enforce the order they become visible as
319   /// observed by other memory instructions executing in memory scope \p Scope.
320   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321   /// address spaces. Returns true iff any instructions inserted.
322   virtual bool insertWait(MachineBasicBlock::iterator &MI,
323                           SIAtomicScope Scope,
324                           SIAtomicAddrSpace AddrSpace,
325                           SIMemOp Op,
326                           bool IsCrossAddrSpaceOrdering,
327                           Position Pos) const = 0;
328 
329   /// Inserts any necessary instructions at position \p Pos relative to
330   /// instruction \p MI to ensure any subsequent memory instructions of this
331   /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
333   /// Returns true iff any instructions inserted.
334   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335                              SIAtomicScope Scope,
336                              SIAtomicAddrSpace AddrSpace,
337                              Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure previous memory instructions by this thread
341   /// with address spaces \p AddrSpace have completed and can be observed by
342   /// subsequent memory instructions by any thread executing in memory scope \p
343   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344   /// between address spaces. Returns true iff any instructions inserted.
345   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              bool IsCrossAddrSpaceOrdering,
349                              Position Pos) const = 0;
350 
351   /// Virtual destructor to allow derivations to be deleted.
352   virtual ~SICacheControl() = default;
353 
354   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355                                    MachineBasicBlock::iterator &MI) const {
356     return false;
357   }
358 };
359 
360 class SIGfx6CacheControl : public SICacheControl {
361 protected:
362 
363   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::GLC);
367   }
368 
369   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370   /// is modified, false otherwise.
371   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372     return enableNamedBit(MI, AMDGPU::CPol::SLC);
373   }
374 
375 public:
376 
377   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378 
379   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380                              SIAtomicScope Scope,
381                              SIAtomicAddrSpace AddrSpace) const override;
382 
383   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384                               SIAtomicScope Scope,
385                               SIAtomicAddrSpace AddrSpace) const override;
386 
387   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388                             SIAtomicScope Scope,
389                             SIAtomicAddrSpace AddrSpace) const override;
390 
391   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393                                       bool IsVolatile,
394                                       bool IsNonTemporal) const override;
395 
396   bool insertWait(MachineBasicBlock::iterator &MI,
397                   SIAtomicScope Scope,
398                   SIAtomicAddrSpace AddrSpace,
399                   SIMemOp Op,
400                   bool IsCrossAddrSpaceOrdering,
401                   Position Pos) const override;
402 
403   bool insertAcquire(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      Position Pos) const override;
407 
408   bool insertRelease(MachineBasicBlock::iterator &MI,
409                      SIAtomicScope Scope,
410                      SIAtomicAddrSpace AddrSpace,
411                      bool IsCrossAddrSpaceOrdering,
412                      Position Pos) const override;
413 };
414 
415 class SIGfx7CacheControl : public SIGfx6CacheControl {
416 public:
417 
418   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419 
420   bool insertAcquire(MachineBasicBlock::iterator &MI,
421                      SIAtomicScope Scope,
422                      SIAtomicAddrSpace AddrSpace,
423                      Position Pos) const override;
424 
425 };
426 
427 class SIGfx90ACacheControl : public SIGfx7CacheControl {
428 public:
429 
430   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431 
432   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433                              SIAtomicScope Scope,
434                              SIAtomicAddrSpace AddrSpace) const override;
435 
436   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437                               SIAtomicScope Scope,
438                               SIAtomicAddrSpace AddrSpace) const override;
439 
440   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441                             SIAtomicScope Scope,
442                             SIAtomicAddrSpace AddrSpace) const override;
443 
444   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446                                       bool IsVolatile,
447                                       bool IsNonTemporal) const override;
448 
449   bool insertWait(MachineBasicBlock::iterator &MI,
450                   SIAtomicScope Scope,
451                   SIAtomicAddrSpace AddrSpace,
452                   SIMemOp Op,
453                   bool IsCrossAddrSpaceOrdering,
454                   Position Pos) const override;
455 
456   bool insertAcquire(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      Position Pos) const override;
460 
461   bool insertRelease(MachineBasicBlock::iterator &MI,
462                      SIAtomicScope Scope,
463                      SIAtomicAddrSpace AddrSpace,
464                      bool IsCrossAddrSpaceOrdering,
465                      Position Pos) const override;
466 };
467 
468 class SIGfx940CacheControl : public SIGfx90ACacheControl {
469 protected:
470 
471   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472   /// is modified, false otherwise.
473   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474     return enableNamedBit(MI, AMDGPU::CPol::SC0);
475   }
476 
477   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478   /// is modified, false otherwise.
479   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480     return enableNamedBit(MI, AMDGPU::CPol::SC1);
481   }
482 
483   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484   /// is modified, false otherwise.
485   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486     return enableNamedBit(MI, AMDGPU::CPol::NT);
487   }
488 
489 public:
490 
  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
492 
493   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494                              SIAtomicScope Scope,
495                              SIAtomicAddrSpace AddrSpace) const override;
496 
497   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498                               SIAtomicScope Scope,
499                               SIAtomicAddrSpace AddrSpace) const override;
500 
501   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502                             SIAtomicScope Scope,
503                             SIAtomicAddrSpace AddrSpace) const override;
504 
505   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507                                       bool IsVolatile,
508                                       bool IsNonTemporal) const override;
509 
510   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512 
513   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515                      Position Pos) const override;
516 
517   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518                            MachineBasicBlock::iterator &MI) const override {
519     bool Changed = false;
520     if (ST.hasForceStoreSC0SC1() &&
521         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522                                     SIAtomicAddrSpace::GLOBAL |
523                                     SIAtomicAddrSpace::OTHER)) !=
524          SIAtomicAddrSpace::NONE) {
525       Changed |= enableSC0Bit(MI);
526       Changed |= enableSC1Bit(MI);
527     }
528     return Changed;
529   }
530 };
531 
532 class SIGfx10CacheControl : public SIGfx7CacheControl {
533 protected:
534 
535   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536   /// is modified, false otherwise.
537   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538     return enableNamedBit(MI, AMDGPU::CPol::DLC);
539   }
540 
541 public:
542 
543   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544 
545   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546                              SIAtomicScope Scope,
547                              SIAtomicAddrSpace AddrSpace) const override;
548 
549   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551                                       bool IsVolatile,
552                                       bool IsNonTemporal) const override;
553 
554   bool insertWait(MachineBasicBlock::iterator &MI,
555                   SIAtomicScope Scope,
556                   SIAtomicAddrSpace AddrSpace,
557                   SIMemOp Op,
558                   bool IsCrossAddrSpaceOrdering,
559                   Position Pos) const override;
560 
561   bool insertAcquire(MachineBasicBlock::iterator &MI,
562                      SIAtomicScope Scope,
563                      SIAtomicAddrSpace AddrSpace,
564                      Position Pos) const override;
565 };
566 
567 class SIGfx11CacheControl : public SIGfx10CacheControl {
568 public:
569   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570 
571   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572                              SIAtomicScope Scope,
573                              SIAtomicAddrSpace AddrSpace) const override;
574 
575   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577                                       bool IsVolatile,
578                                       bool IsNonTemporal) const override;
579 };
580 
581 class SIMemoryLegalizer final : public MachineFunctionPass {
582 private:
583 
584   /// Cache Control.
585   std::unique_ptr<SICacheControl> CC = nullptr;
586 
587   /// List of atomic pseudo instructions.
588   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
589 
  /// Return true iff instruction \p MI is an atomic instruction that
591   /// returns a result.
592   bool isAtomicRet(const MachineInstr &MI) const {
593     return SIInstrInfo::isAtomicRet(MI);
594   }
595 
596   /// Removes all processed atomic pseudo instructions from the current
597   /// function. Returns true if current function is modified, false otherwise.
598   bool removeAtomicPseudoMIs();
599 
600   /// Expands load operation \p MI. Returns true if instructions are
601   /// added/deleted or \p MI is modified, false otherwise.
602   bool expandLoad(const SIMemOpInfo &MOI,
603                   MachineBasicBlock::iterator &MI);
604   /// Expands store operation \p MI. Returns true if instructions are
605   /// added/deleted or \p MI is modified, false otherwise.
606   bool expandStore(const SIMemOpInfo &MOI,
607                    MachineBasicBlock::iterator &MI);
608   /// Expands atomic fence operation \p MI. Returns true if
609   /// instructions are added/deleted or \p MI is modified, false otherwise.
610   bool expandAtomicFence(const SIMemOpInfo &MOI,
611                          MachineBasicBlock::iterator &MI);
612   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
613   /// instructions are added/deleted or \p MI is modified, false otherwise.
614   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
615                                 MachineBasicBlock::iterator &MI);
616 
617 public:
618   static char ID;
619 
620   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
621 
622   void getAnalysisUsage(AnalysisUsage &AU) const override {
623     AU.setPreservesCFG();
624     MachineFunctionPass::getAnalysisUsage(AU);
625   }
626 
627   StringRef getPassName() const override {
628     return PASS_NAME;
629   }
630 
631   bool runOnMachineFunction(MachineFunction &MF) override;
632 };
633 
} // end anonymous namespace
635 
636 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
637                                       const char *Msg) const {
638   const Function &Func = MI->getParent()->getParent()->getFunction();
639   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
640   Func.getContext().diagnose(Diag);
641 }
642 
643 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
644 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
645                                SIAtomicAddrSpace InstrAddrSpace) const {
646   if (SSID == SyncScope::System)
647     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
648   if (SSID == MMI->getAgentSSID())
649     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
650   if (SSID == MMI->getWorkgroupSSID())
651     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
652                       true);
653   if (SSID == MMI->getWavefrontSSID())
654     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
655                       true);
656   if (SSID == SyncScope::SingleThread)
657     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
658                       true);
659   if (SSID == MMI->getSystemOneAddressSpaceSSID())
660     return std::tuple(SIAtomicScope::SYSTEM,
661                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
662   if (SSID == MMI->getAgentOneAddressSpaceSSID())
663     return std::tuple(SIAtomicScope::AGENT,
664                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
665   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
666     return std::tuple(SIAtomicScope::WORKGROUP,
667                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
668   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
669     return std::tuple(SIAtomicScope::WAVEFRONT,
670                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
671   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
672     return std::tuple(SIAtomicScope::SINGLETHREAD,
673                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
674   return std::nullopt;
675 }
676 
677 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
678   if (AS == AMDGPUAS::FLAT_ADDRESS)
679     return SIAtomicAddrSpace::FLAT;
680   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
681     return SIAtomicAddrSpace::GLOBAL;
682   if (AS == AMDGPUAS::LOCAL_ADDRESS)
683     return SIAtomicAddrSpace::LDS;
684   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
685     return SIAtomicAddrSpace::SCRATCH;
686   if (AS == AMDGPUAS::REGION_ADDRESS)
687     return SIAtomicAddrSpace::GDS;
688 
689   return SIAtomicAddrSpace::OTHER;
690 }
691 
692 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
693   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
694 }
695 
696 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
697     const MachineBasicBlock::iterator &MI) const {
698   assert(MI->getNumMemOperands() > 0);
699 
700   SyncScope::ID SSID = SyncScope::SingleThread;
701   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
702   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
703   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
704   bool IsNonTemporal = true;
705   bool IsVolatile = false;
706 
707   // Validator should check whether or not MMOs cover the entire set of
708   // locations accessed by the memory instruction.
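  // The properties are merged across all memory operands: IsNonTemporal is
  // ANDed (every operand must be nontemporal), while IsVolatile is ORed (any
  // volatile operand suffices).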
709   for (const auto &MMO : MI->memoperands()) {
710     IsNonTemporal &= MMO->isNonTemporal();
711     IsVolatile |= MMO->isVolatile();
712     InstrAddrSpace |=
713       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
714     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
715     if (OpOrdering != AtomicOrdering::NotAtomic) {
716       const auto &IsSyncScopeInclusion =
717           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
718       if (!IsSyncScopeInclusion) {
719         reportUnsupported(MI,
720           "Unsupported non-inclusive atomic synchronization scope");
721         return std::nullopt;
722       }
723 
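      // Merge to the larger (more inclusive) of the two synchronization
      // scopes; incomparable scopes were rejected above.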
724       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
725       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
726       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
727              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
728       FailureOrdering =
729           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
730     }
731   }
732 
733   SIAtomicScope Scope = SIAtomicScope::NONE;
734   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
735   bool IsCrossAddressSpaceOrdering = false;
736   if (Ordering != AtomicOrdering::NotAtomic) {
737     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
738     if (!ScopeOrNone) {
739       reportUnsupported(MI, "Unsupported atomic synchronization scope");
740       return std::nullopt;
741     }
742     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
743         *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
747       reportUnsupported(MI, "Unsupported atomic address space");
748       return std::nullopt;
749     }
750   }
751   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
752                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
753                      IsNonTemporal);
754 }
755 
756 std::optional<SIMemOpInfo>
757 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
758   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
759 
760   if (!(MI->mayLoad() && !MI->mayStore()))
761     return std::nullopt;
762 
763   // Be conservative if there are no memory operands.
764   if (MI->getNumMemOperands() == 0)
765     return SIMemOpInfo();
766 
767   return constructFromMIWithMMO(MI);
768 }
769 
770 std::optional<SIMemOpInfo>
771 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
772   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
773 
774   if (!(!MI->mayLoad() && MI->mayStore()))
775     return std::nullopt;
776 
777   // Be conservative if there are no memory operands.
778   if (MI->getNumMemOperands() == 0)
779     return SIMemOpInfo();
780 
781   return constructFromMIWithMMO(MI);
782 }
783 
784 std::optional<SIMemOpInfo>
785 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
786   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
787 
788   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
789     return std::nullopt;
790 
791   AtomicOrdering Ordering =
792     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
793 
794   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
795   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
796   if (!ScopeOrNone) {
797     reportUnsupported(MI, "Unsupported atomic synchronization scope");
798     return std::nullopt;
799   }
800 
801   SIAtomicScope Scope = SIAtomicScope::NONE;
802   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
803   bool IsCrossAddressSpaceOrdering = false;
804   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
805       *ScopeOrNone;
806 
807   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
808       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
809     reportUnsupported(MI, "Unsupported atomic address space");
810     return std::nullopt;
811   }
812 
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
815 }
816 
817 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
818     const MachineBasicBlock::iterator &MI) const {
819   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
820 
821   if (!(MI->mayLoad() && MI->mayStore()))
822     return std::nullopt;
823 
824   // Be conservative if there are no memory operands.
825   if (MI->getNumMemOperands() == 0)
826     return SIMemOpInfo();
827 
828   return constructFromMIWithMMO(MI);
829 }
830 
831 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
832   TII = ST.getInstrInfo();
833   IV = getIsaVersion(ST.getCPU());
834   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
835 }
836 
837 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
838                                     AMDGPU::CPol::CPol Bit) const {
839   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
840   if (!CPol)
841     return false;
842 
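  // The cpol operand holds the cache policy bits as an immediate bitmask; OR
  // in the requested bit without clearing any bits that are already set.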
843   CPol->setImm(CPol->getImm() | Bit);
844   return true;
845 }
846 
847 /* static */
848 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
849   GCNSubtarget::Generation Generation = ST.getGeneration();
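  // Subtargets with GFX940 or GFX90A instructions are matched by feature
  // first; otherwise they would fall through to the generation-based checks
  // below.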
850   if (ST.hasGFX940Insts())
851     return std::make_unique<SIGfx940CacheControl>(ST);
852   if (ST.hasGFX90AInsts())
853     return std::make_unique<SIGfx90ACacheControl>(ST);
854   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
855     return std::make_unique<SIGfx6CacheControl>(ST);
856   if (Generation < AMDGPUSubtarget::GFX10)
857     return std::make_unique<SIGfx7CacheControl>(ST);
858   if (Generation < AMDGPUSubtarget::GFX11)
859     return std::make_unique<SIGfx10CacheControl>(ST);
860   return std::make_unique<SIGfx11CacheControl>(ST);
861 }
862 
863 bool SIGfx6CacheControl::enableLoadCacheBypass(
864     const MachineBasicBlock::iterator &MI,
865     SIAtomicScope Scope,
866     SIAtomicAddrSpace AddrSpace) const {
867   assert(MI->mayLoad() && !MI->mayStore());
868   bool Changed = false;
869 
870   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
871     switch (Scope) {
872     case SIAtomicScope::SYSTEM:
873     case SIAtomicScope::AGENT:
874       // Set L1 cache policy to MISS_EVICT.
875       // Note: there is no L2 cache bypass policy at the ISA level.
876       Changed |= enableGLCBit(MI);
877       break;
878     case SIAtomicScope::WORKGROUP:
879     case SIAtomicScope::WAVEFRONT:
880     case SIAtomicScope::SINGLETHREAD:
881       // No cache to bypass.
882       break;
883     default:
884       llvm_unreachable("Unsupported synchronization scope");
885     }
886   }
887 
888   /// The scratch address space does not need the global memory caches
889   /// to be bypassed as all memory operations by the same thread are
890   /// sequentially consistent, and no other thread can access scratch
891   /// memory.
892 
893   /// Other address spaces do not have a cache.
894 
895   return Changed;
896 }
897 
898 bool SIGfx6CacheControl::enableStoreCacheBypass(
899     const MachineBasicBlock::iterator &MI,
900     SIAtomicScope Scope,
901     SIAtomicAddrSpace AddrSpace) const {
902   assert(!MI->mayLoad() && MI->mayStore());
903   bool Changed = false;
904 
  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.
907 
908   return Changed;
909 }
910 
911 bool SIGfx6CacheControl::enableRMWCacheBypass(
912     const MachineBasicBlock::iterator &MI,
913     SIAtomicScope Scope,
914     SIAtomicAddrSpace AddrSpace) const {
915   assert(MI->mayLoad() && MI->mayStore());
916   bool Changed = false;
917 
  /// Do not set GLC for RMW atomic operations as the L0/L1 cache is
  /// automatically bypassed, and the GLC bit is instead used to indicate if
  /// they are return or no-return.
921   /// Note: there is no L2 cache coherent bypass control at the ISA level.
922 
923   return Changed;
924 }
925 
926 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
927     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
928     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate whether the atomic returns a result, so
  // glc must not be used for cache control.
932   assert(MI->mayLoad() ^ MI->mayStore());
933 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and treating them
  // as such here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
938   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
939 
940   bool Changed = false;
941 
942   if (IsVolatile) {
943     // Set L1 cache policy to be MISS_EVICT for load instructions
944     // and MISS_LRU for store instructions.
945     // Note: there is no L2 cache bypass policy at the ISA level.
946     if (Op == SIMemOp::LOAD)
947       Changed |= enableGLCBit(MI);
948 
949     // Ensure operation has completed at system scope to cause all volatile
950     // operations to be visible outside the program in a global order. Do not
951     // request cross address space as only the global address space can be
952     // observable outside the program, so no need to cause a waitcnt for LDS
953     // address space operations.
954     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
955                           Position::AFTER);
956 
957     return Changed;
958   }
959 
960   if (IsNonTemporal) {
961     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
962     // for both loads and stores, and the L2 cache policy to STREAM.
963     Changed |= enableGLCBit(MI);
964     Changed |= enableSLCBit(MI);
965     return Changed;
966   }
967 
968   return Changed;
969 }
970 
971 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
972                                     SIAtomicScope Scope,
973                                     SIAtomicAddrSpace AddrSpace,
974                                     SIMemOp Op,
975                                     bool IsCrossAddrSpaceOrdering,
976                                     Position Pos) const {
977   bool Changed = false;
978 
979   MachineBasicBlock &MBB = *MI->getParent();
980   DebugLoc DL = MI->getDebugLoc();
981 
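  // When inserting after MI, temporarily step the iterator forward so the
  // S_WAITCNT is emitted after MI; the iterator is restored before returning.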
982   if (Pos == Position::AFTER)
983     ++MI;
984 
985   bool VMCnt = false;
986   bool LGKMCnt = false;
987 
988   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
989       SIAtomicAddrSpace::NONE) {
990     switch (Scope) {
991     case SIAtomicScope::SYSTEM:
992     case SIAtomicScope::AGENT:
993       VMCnt |= true;
994       break;
995     case SIAtomicScope::WORKGROUP:
996     case SIAtomicScope::WAVEFRONT:
997     case SIAtomicScope::SINGLETHREAD:
998       // The L1 cache keeps all memory operations in order for
999       // wavefronts in the same work-group.
1000       break;
1001     default:
1002       llvm_unreachable("Unsupported synchronization scope");
1003     }
1004   }
1005 
1006   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1007     switch (Scope) {
1008     case SIAtomicScope::SYSTEM:
1009     case SIAtomicScope::AGENT:
1010     case SIAtomicScope::WORKGROUP:
1011       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1012       // not needed as LDS operations for all waves are executed in a total
1013       // global ordering as observed by all waves. Required if also
1014       // synchronizing with global/GDS memory as LDS operations could be
1015       // reordered with respect to later global/GDS memory operations of the
1016       // same wave.
1017       LGKMCnt |= IsCrossAddrSpaceOrdering;
1018       break;
1019     case SIAtomicScope::WAVEFRONT:
1020     case SIAtomicScope::SINGLETHREAD:
1021       // The LDS keeps all memory operations in order for
1022       // the same wavefront.
1023       break;
1024     default:
1025       llvm_unreachable("Unsupported synchronization scope");
1026     }
1027   }
1028 
1029   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1030     switch (Scope) {
1031     case SIAtomicScope::SYSTEM:
1032     case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1034       // is not needed as GDS operations for all waves are executed in a total
1035       // global ordering as observed by all waves. Required if also
1036       // synchronizing with global/LDS memory as GDS operations could be
1037       // reordered with respect to later global/LDS memory operations of the
1038       // same wave.
1039       LGKMCnt |= IsCrossAddrSpaceOrdering;
1040       break;
1041     case SIAtomicScope::WORKGROUP:
1042     case SIAtomicScope::WAVEFRONT:
1043     case SIAtomicScope::SINGLETHREAD:
1044       // The GDS keeps all memory operations in order for
1045       // the same work-group.
1046       break;
1047     default:
1048       llvm_unreachable("Unsupported synchronization scope");
1049     }
1050   }
1051 
1052   if (VMCnt || LGKMCnt) {
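    // A count of zero waits for all outstanding operations of that kind to
    // complete; passing the counter's full bitmask leaves it unconstrained.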
1053     unsigned WaitCntImmediate =
1054       AMDGPU::encodeWaitcnt(IV,
1055                             VMCnt ? 0 : getVmcntBitMask(IV),
1056                             getExpcntBitMask(IV),
1057                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1058     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1059     Changed = true;
1060   }
1061 
1062   if (Pos == Position::AFTER)
1063     --MI;
1064 
1065   return Changed;
1066 }
1067 
1068 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1069                                        SIAtomicScope Scope,
1070                                        SIAtomicAddrSpace AddrSpace,
1071                                        Position Pos) const {
1072   if (!InsertCacheInv)
1073     return false;
1074 
1075   bool Changed = false;
1076 
1077   MachineBasicBlock &MBB = *MI->getParent();
1078   DebugLoc DL = MI->getDebugLoc();
1079 
1080   if (Pos == Position::AFTER)
1081     ++MI;
1082 
1083   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1084     switch (Scope) {
1085     case SIAtomicScope::SYSTEM:
1086     case SIAtomicScope::AGENT:
1087       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1088       Changed = true;
1089       break;
1090     case SIAtomicScope::WORKGROUP:
1091     case SIAtomicScope::WAVEFRONT:
1092     case SIAtomicScope::SINGLETHREAD:
1093       // No cache to invalidate.
1094       break;
1095     default:
1096       llvm_unreachable("Unsupported synchronization scope");
1097     }
1098   }
1099 
1100   /// The scratch address space does not need the global memory cache
1101   /// to be flushed as all memory operations by the same thread are
1102   /// sequentially consistent, and no other thread can access scratch
1103   /// memory.
1104 
1105   /// Other address spaces do not have a cache.
1106 
1107   if (Pos == Position::AFTER)
1108     --MI;
1109 
1110   return Changed;
1111 }
1112 
1113 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1114                                        SIAtomicScope Scope,
1115                                        SIAtomicAddrSpace AddrSpace,
1116                                        bool IsCrossAddrSpaceOrdering,
1117                                        Position Pos) const {
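  // A release here reduces to waiting for prior loads and stores to complete;
  // no cache writeback instruction is issued.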
1118   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1119                     IsCrossAddrSpaceOrdering, Pos);
1120 }
1121 
1122 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1123                                        SIAtomicScope Scope,
1124                                        SIAtomicAddrSpace AddrSpace,
1125                                        Position Pos) const {
1126   if (!InsertCacheInv)
1127     return false;
1128 
1129   bool Changed = false;
1130 
1131   MachineBasicBlock &MBB = *MI->getParent();
1132   DebugLoc DL = MI->getDebugLoc();
1133 
1134   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1135 
1136   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1137                                     ? AMDGPU::BUFFER_WBINVL1
1138                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1139 
1140   if (Pos == Position::AFTER)
1141     ++MI;
1142 
1143   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1144     switch (Scope) {
1145     case SIAtomicScope::SYSTEM:
1146     case SIAtomicScope::AGENT:
1147       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1148       Changed = true;
1149       break;
1150     case SIAtomicScope::WORKGROUP:
1151     case SIAtomicScope::WAVEFRONT:
1152     case SIAtomicScope::SINGLETHREAD:
1153       // No cache to invalidate.
1154       break;
1155     default:
1156       llvm_unreachable("Unsupported synchronization scope");
1157     }
1158   }
1159 
1160   /// The scratch address space does not need the global memory cache
1161   /// to be flushed as all memory operations by the same thread are
1162   /// sequentially consistent, and no other thread can access scratch
1163   /// memory.
1164 
1165   /// Other address spaces do not have a cache.
1166 
1167   if (Pos == Position::AFTER)
1168     --MI;
1169 
1170   return Changed;
1171 }
1172 
1173 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1174     const MachineBasicBlock::iterator &MI,
1175     SIAtomicScope Scope,
1176     SIAtomicAddrSpace AddrSpace) const {
1177   assert(MI->mayLoad() && !MI->mayStore());
1178   bool Changed = false;
1179 
1180   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1181     switch (Scope) {
1182     case SIAtomicScope::SYSTEM:
1183     case SIAtomicScope::AGENT:
1184       // Set the L1 cache policy to MISS_LRU.
1185       // Note: there is no L2 cache bypass policy at the ISA level.
1186       Changed |= enableGLCBit(MI);
1187       break;
1188     case SIAtomicScope::WORKGROUP:
1189       // In threadgroup split mode the waves of a work-group can be executing on
1190       // different CUs. Therefore need to bypass the L1 which is per CU.
1191       // Otherwise in non-threadgroup split mode all waves of a work-group are
1192       // on the same CU, and so the L1 does not need to be bypassed.
1193       if (ST.isTgSplitEnabled())
1194         Changed |= enableGLCBit(MI);
1195       break;
1196     case SIAtomicScope::WAVEFRONT:
1197     case SIAtomicScope::SINGLETHREAD:
1198       // No cache to bypass.
1199       break;
1200     default:
1201       llvm_unreachable("Unsupported synchronization scope");
1202     }
1203   }
1204 
1205   /// The scratch address space does not need the global memory caches
1206   /// to be bypassed as all memory operations by the same thread are
1207   /// sequentially consistent, and no other thread can access scratch
1208   /// memory.
1209 
1210   /// Other address spaces do not have a cache.
1211 
1212   return Changed;
1213 }
1214 
1215 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1216     const MachineBasicBlock::iterator &MI,
1217     SIAtomicScope Scope,
1218     SIAtomicAddrSpace AddrSpace) const {
1219   assert(!MI->mayLoad() && MI->mayStore());
1220   bool Changed = false;
1221 
1222   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1223     switch (Scope) {
1224     case SIAtomicScope::SYSTEM:
1225     case SIAtomicScope::AGENT:
1226       /// Do not set glc for store atomic operations as they implicitly write
1227       /// through the L1 cache.
1228       break;
1229     case SIAtomicScope::WORKGROUP:
1230     case SIAtomicScope::WAVEFRONT:
1231     case SIAtomicScope::SINGLETHREAD:
1232       // No cache to bypass. Store atomics implicitly write through the L1
1233       // cache.
1234       break;
1235     default:
1236       llvm_unreachable("Unsupported synchronization scope");
1237     }
1238   }
1239 
1240   /// The scratch address space does not need the global memory caches
1241   /// to be bypassed as all memory operations by the same thread are
1242   /// sequentially consistent, and no other thread can access scratch
1243   /// memory.
1244 
1245   /// Other address spaces do not have a cache.
1246 
1247   return Changed;
1248 }
1249 
1250 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1251     const MachineBasicBlock::iterator &MI,
1252     SIAtomicScope Scope,
1253     SIAtomicAddrSpace AddrSpace) const {
1254   assert(MI->mayLoad() && MI->mayStore());
1255   bool Changed = false;
1256 
1257   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1258     switch (Scope) {
1259     case SIAtomicScope::SYSTEM:
1260     case SIAtomicScope::AGENT:
1261       /// Do not set glc for RMW atomic operations as they implicitly bypass
1262       /// the L1 cache, and the glc bit is instead used to indicate if they are
1263       /// return or no-return.
1264       break;
1265     case SIAtomicScope::WORKGROUP:
1266     case SIAtomicScope::WAVEFRONT:
1267     case SIAtomicScope::SINGLETHREAD:
1268       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1269       break;
1270     default:
1271       llvm_unreachable("Unsupported synchronization scope");
1272     }
1273   }
1274 
1275   return Changed;
1276 }
1277 
1278 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1279     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1280     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate whether the atomic returns a result, so
  // glc must not be used for cache control.
1284   assert(MI->mayLoad() ^ MI->mayStore());
1285 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and treating them
  // as such here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
1290   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1291 
1292   bool Changed = false;
1293 
1294   if (IsVolatile) {
1295     // Set L1 cache policy to be MISS_EVICT for load instructions
1296     // and MISS_LRU for store instructions.
1297     // Note: there is no L2 cache bypass policy at the ISA level.
1298     if (Op == SIMemOp::LOAD)
1299       Changed |= enableGLCBit(MI);
1300 
1301     // Ensure operation has completed at system scope to cause all volatile
1302     // operations to be visible outside the program in a global order. Do not
1303     // request cross address space as only the global address space can be
1304     // observable outside the program, so no need to cause a waitcnt for LDS
1305     // address space operations.
1306     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1307                           Position::AFTER);
1308 
1309     return Changed;
1310   }
1311 
1312   if (IsNonTemporal) {
1313     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1314     // for both loads and stores, and the L2 cache policy to STREAM.
1315     Changed |= enableGLCBit(MI);
1316     Changed |= enableSLCBit(MI);
1317     return Changed;
1318   }
1319 
1320   return Changed;
1321 }
1322 
1323 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1324                                       SIAtomicScope Scope,
1325                                       SIAtomicAddrSpace AddrSpace,
1326                                       SIMemOp Op,
1327                                       bool IsCrossAddrSpaceOrdering,
1328                                       Position Pos) const {
1329   if (ST.isTgSplitEnabled()) {
1330     // In threadgroup split mode the waves of a work-group can be executing on
1331     // different CUs. Therefore need to wait for global or GDS memory operations
1332     // to complete to ensure they are visible to waves in the other CUs.
1333     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1334     // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
1336     // on a CU.
1337     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1338                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1339         (Scope == SIAtomicScope::WORKGROUP)) {
1340       // Same as GFX7 using agent scope.
1341       Scope = SIAtomicScope::AGENT;
1342     }
1343     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1344     // LDS memory operations.
1345     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1346   }
1347   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1348                                         IsCrossAddrSpaceOrdering, Pos);
1349 }
1350 
1351 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1352                                          SIAtomicScope Scope,
1353                                          SIAtomicAddrSpace AddrSpace,
1354                                          Position Pos) const {
1355   if (!InsertCacheInv)
1356     return false;
1357 
1358   bool Changed = false;
1359 
1360   MachineBasicBlock &MBB = *MI->getParent();
1361   DebugLoc DL = MI->getDebugLoc();
1362 
1363   if (Pos == Position::AFTER)
1364     ++MI;
1365 
1366   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1367     switch (Scope) {
1368     case SIAtomicScope::SYSTEM:
1369       // Ensures that following loads will not see stale remote VMEM data or
1370       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1371       // CC will never be stale due to the local memory probes.
1372       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1373       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1374       // hardware does not reorder memory operations by the same wave with
1375       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1376       // remove any cache lines of earlier writes by the same wave and ensures
1377       // later reads by the same wave will refetch the cache lines.
1378       Changed = true;
1379       break;
1380     case SIAtomicScope::AGENT:
1381       // Same as GFX7.
1382       break;
1383     case SIAtomicScope::WORKGROUP:
1384       // In threadgroup split mode the waves of a work-group can be executing on
1385       // different CUs. Therefore need to invalidate the L1 which is per CU.
1386       // Otherwise in non-threadgroup split mode all waves of a work-group are
1387       // on the same CU, and so the L1 does not need to be invalidated.
1388       if (ST.isTgSplitEnabled()) {
1389         // Same as GFX7 using agent scope.
1390         Scope = SIAtomicScope::AGENT;
1391       }
1392       break;
1393     case SIAtomicScope::WAVEFRONT:
1394     case SIAtomicScope::SINGLETHREAD:
1395       // Same as GFX7.
1396       break;
1397     default:
1398       llvm_unreachable("Unsupported synchronization scope");
1399     }
1400   }
1401 
1402   /// The scratch address space does not need the global memory cache
1403   /// to be flushed as all memory operations by the same thread are
1404   /// sequentially consistent, and no other thread can access scratch
1405   /// memory.
1406 
1407   /// Other address spaces do not have a cache.
1408 
1409   if (Pos == Position::AFTER)
1410     --MI;
1411 
1412   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1413 
1414   return Changed;
1415 }
1416 
1417 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1418                                          SIAtomicScope Scope,
1419                                          SIAtomicAddrSpace AddrSpace,
1420                                          bool IsCrossAddrSpaceOrdering,
1421                                          Position Pos) const {
1422   bool Changed = false;
1423 
1424   MachineBasicBlock &MBB = *MI->getParent();
1425   DebugLoc DL = MI->getDebugLoc();
1426 
1427   if (Pos == Position::AFTER)
1428     ++MI;
1429 
1430   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1431     switch (Scope) {
1432     case SIAtomicScope::SYSTEM:
1433       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1434       // hardware does not reorder memory operations by the same wave with
1435       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1436       // to initiate writeback of any dirty cache lines of earlier writes by the
1437       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1438       // writeback has completed.
1439       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1440         // Set SC bits to indicate system scope.
1441         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1442       // This is followed by the same code as GFX7, which will insert the
1443       // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1444       Changed = true;
1445       break;
1446     case SIAtomicScope::AGENT:
1447     case SIAtomicScope::WORKGROUP:
1448     case SIAtomicScope::WAVEFRONT:
1449     case SIAtomicScope::SINGLETHREAD:
1450       // Same as GFX7.
1451       break;
1452     default:
1453       llvm_unreachable("Unsupported synchronization scope");
1454     }
1455   }
1456 
1457   if (Pos == Position::AFTER)
1458     --MI;
1459 
1460   Changed |=
1461       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1462                                         IsCrossAddrSpaceOrdering, Pos);
1463 
1464   return Changed;
1465 }
1466 
1467 bool SIGfx940CacheControl::enableLoadCacheBypass(
1468     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1469     SIAtomicAddrSpace AddrSpace) const {
1470   assert(MI->mayLoad() && !MI->mayStore());
1471   bool Changed = false;
1472 
1473   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1474     switch (Scope) {
1475     case SIAtomicScope::SYSTEM:
1476       // Set SC bits to indicate system scope.
1477       Changed |= enableSC0Bit(MI);
1478       Changed |= enableSC1Bit(MI);
1479       break;
1480     case SIAtomicScope::AGENT:
1481       // Set SC bits to indicate agent scope.
1482       Changed |= enableSC1Bit(MI);
1483       break;
1484     case SIAtomicScope::WORKGROUP:
1485       // In threadgroup split mode the waves of a work-group can be executing on
1486       // different CUs. Therefore need to bypass the L1 which is per CU.
1487       // Otherwise in non-threadgroup split mode all waves of a work-group are
1488       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1489       // bits to indicate work-group scope will do this automatically.
1490       Changed |= enableSC0Bit(MI);
1491       break;
1492     case SIAtomicScope::WAVEFRONT:
1493     case SIAtomicScope::SINGLETHREAD:
1494       // Leave SC bits unset to indicate wavefront scope.
1495       break;
1496     default:
1497       llvm_unreachable("Unsupported synchronization scope");
1498     }
1499   }
1500 
1501   /// The scratch address space does not need the global memory caches
1502   /// to be bypassed as all memory operations by the same thread are
1503   /// sequentially consistent, and no other thread can access scratch
1504   /// memory.
1505 
1506   /// Other address spaces do not have a cache.
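  // Illustrative only (assuming the usual GFX940 CPol printing): with the
  // mapping above, the cache-policy operand on the load would print as
  // "sc0 sc1" for system scope, "sc1" for agent scope, and "sc0" for
  // work-group scope.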
1507 
1508   return Changed;
1509 }
1510 
1511 bool SIGfx940CacheControl::enableStoreCacheBypass(
1512     const MachineBasicBlock::iterator &MI,
1513     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1514   assert(!MI->mayLoad() && MI->mayStore());
1515   bool Changed = false;
1516 
1517   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1518     switch (Scope) {
1519     case SIAtomicScope::SYSTEM:
1520       // Set SC bits to indicate system scope.
1521       Changed |= enableSC0Bit(MI);
1522       Changed |= enableSC1Bit(MI);
1523       break;
1524     case SIAtomicScope::AGENT:
1525       // Set SC bits to indicate agent scope.
1526       Changed |= enableSC1Bit(MI);
1527       break;
1528     case SIAtomicScope::WORKGROUP:
1529       // Set SC bits to indicate workgroup scope.
1530       Changed |= enableSC0Bit(MI);
1531       break;
1532     case SIAtomicScope::WAVEFRONT:
1533     case SIAtomicScope::SINGLETHREAD:
1534       // Leave SC bits unset to indicate wavefront scope.
1535       break;
1536     default:
1537       llvm_unreachable("Unsupported synchronization scope");
1538     }
1539   }
1540 
1541   /// The scratch address space does not need the global memory caches
1542   /// to be bypassed as all memory operations by the same thread are
1543   /// sequentially consistent, and no other thread can access scratch
1544   /// memory.
1545 
1546   /// Other address spaces do not have a cache.
1547 
1548   return Changed;
1549 }
1550 
1551 bool SIGfx940CacheControl::enableRMWCacheBypass(
1552     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1553     SIAtomicAddrSpace AddrSpace) const {
1554   assert(MI->mayLoad() && MI->mayStore());
1555   bool Changed = false;
1556 
1557   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1558     switch (Scope) {
1559     case SIAtomicScope::SYSTEM:
1560       // Set SC1 bit to indicate system scope.
1561       Changed |= enableSC1Bit(MI);
1562       break;
1563     case SIAtomicScope::AGENT:
1564     case SIAtomicScope::WORKGROUP:
1565     case SIAtomicScope::WAVEFRONT:
1566     case SIAtomicScope::SINGLETHREAD:
1567       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1568       // to indicate system or agent scope. The SC0 bit is used to indicate if
1569       // they are return or no-return. Leave SC1 bit unset to indicate agent
1570       // scope.
1571       break;
1572     default:
1573       llvm_unreachable("Unsupported synchronization scope");
1574     }
1575   }
1576 
1577   return Changed;
1578 }
1579 
1580 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1581     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1582     bool IsVolatile, bool IsNonTemporal) const {
1583   // Only handle load and store, not atomic read-modify-write instructions. The
1584   // latter use glc to indicate if the atomic returns a result and so must not
1585   // be used for cache control.
1586   assert(MI->mayLoad() ^ MI->mayStore());
1587 
1588   // Only update load and store, not LLVM IR atomic read-modify-write
1589   // instructions. The latter are always marked as volatile, so they cannot
1590   // sensibly be handled here without pessimizing all atomics. They also do not
1591   // support the nontemporal attribute.
1592   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1593 
1594   bool Changed = false;
1595 
1596   if (IsVolatile) {
1597     // Set SC bits to indicate system scope.
1598     Changed |= enableSC0Bit(MI);
1599     Changed |= enableSC1Bit(MI);
1600 
1601     // Ensure operation has completed at system scope to cause all volatile
1602     // operations to be visible outside the program in a global order. Do not
1603     // request cross address space as only the global address space can be
1604     // observable outside the program, so no need to cause a waitcnt for LDS
1605     // address space operations.
1606     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1607                           Position::AFTER);
1608 
1609     return Changed;
1610   }
1611 
1612   if (IsNonTemporal) {
1613     Changed |= enableNTBit(MI);
1614     return Changed;
1615   }
1616 
1617   return Changed;
1618 }
1619 
1620 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1621                                          SIAtomicScope Scope,
1622                                          SIAtomicAddrSpace AddrSpace,
1623                                          Position Pos) const {
1624   if (!InsertCacheInv)
1625     return false;
1626 
1627   bool Changed = false;
1628 
1629   MachineBasicBlock &MBB = *MI->getParent();
1630   DebugLoc DL = MI->getDebugLoc();
1631 
1632   if (Pos == Position::AFTER)
1633     ++MI;
1634 
1635   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1636     switch (Scope) {
1637     case SIAtomicScope::SYSTEM:
1638       // Ensures that following loads will not see stale remote VMEM data or
1639       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1640       // CC will never be stale due to the local memory probes.
1641       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1642           // Set SC bits to indicate system scope.
1643           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1644       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1645       // hardware does not reorder memory operations by the same wave with
1646       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1647       // remove any cache lines of earlier writes by the same wave and ensures
1648       // later reads by the same wave will refetch the cache lines.
1649       Changed = true;
1650       break;
1651     case SIAtomicScope::AGENT:
1652       // Ensures that following loads will not see stale remote data or local
1653       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1654       // due to the memory probes.
1655       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1656           // Set SC bits to indicate agent scope.
1657           .addImm(AMDGPU::CPol::SC1);
1658       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1659       // does not reorder memory operations with respect to a preceding buffer
1660       // invalidate. The invalidate is guaranteed to remove any cache lines of
1661       // earlier writes and ensures later reads will refetch the cache lines.
1662       Changed = true;
1663       break;
1664     case SIAtomicScope::WORKGROUP:
1665       // In threadgroup split mode the waves of a work-group can be executing on
1666       // different CUs. Therefore need to invalidate the L1 which is per CU.
1667       // Otherwise in non-threadgroup split mode all waves of a work-group are
1668       // on the same CU, and so the L1 does not need to be invalidated.
1669       if (ST.isTgSplitEnabled()) {
1670         // Ensures L1 is invalidated if in threadgroup split mode. In
1671         // non-threadgroup split mode it is a NOP, but there is no point
1672         // generating it when we know we are not in that mode.
1673         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1674             // Set SC bits to indicate work-group scope.
1675             .addImm(AMDGPU::CPol::SC0);
1676         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1677         // does not reorder memory operations with respect to a preceding buffer
1678         // invalidate. The invalidate is guaranteed to remove any cache lines of
1679         // earlier writes and ensures later reads will refetch the cache lines.
1680         Changed = true;
1681       }
1682       break;
1683     case SIAtomicScope::WAVEFRONT:
1684     case SIAtomicScope::SINGLETHREAD:
1685       // Could generate "BUFFER_INV" but it would do nothing as there are no
1686       // caches to invalidate.
1687       break;
1688     default:
1689       llvm_unreachable("Unsupported synchronization scope");
1690     }
1691   }
1692 
1693   /// The scratch address space does not need the global memory cache
1694   /// to be flushed as all memory operations by the same thread are
1695   /// sequentially consistent, and no other thread can access scratch
1696   /// memory.
1697 
1698   /// Other address spaces do not have a cache.
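  // Illustrative only (assumed assembly printing): the cases above therefore
  // insert a single "buffer_inv sc0 sc1" for a system-scope acquire and a
  // single "buffer_inv sc1" for an agent-scope acquire.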
1699 
1700   if (Pos == Position::AFTER)
1701     --MI;
1702 
1703   return Changed;
1704 }
1705 
1706 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1707                                          SIAtomicScope Scope,
1708                                          SIAtomicAddrSpace AddrSpace,
1709                                          bool IsCrossAddrSpaceOrdering,
1710                                          Position Pos) const {
1711   bool Changed = false;
1712 
1713   MachineBasicBlock &MBB = *MI->getParent();
1714   DebugLoc DL = MI->getDebugLoc();
1715 
1716   if (Pos == Position::AFTER)
1717     ++MI;
1718 
1719   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1720     switch (Scope) {
1721     case SIAtomicScope::SYSTEM:
1722       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1723       // hardware does not reorder memory operations by the same wave with
1724       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1725       // to initiate writeback of any dirty cache lines of earlier writes by the
1726       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1727       // writeback has completed.
1728       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1729           // Set SC bits to indicate system scope.
1730           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1731       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1732       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1733       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1734       Changed = true;
1735       break;
1736     case SIAtomicScope::AGENT:
1737       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1738           // Set SC bits to indicate agent scope.
1739           .addImm(AMDGPU::CPol::SC1);
1740 
1741       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1742       // SIAtomicScope::AGENT, the following insertWait will generate the
1743       // required "S_WAITCNT vmcnt(0)".
1744       Changed = true;
1745       break;
1746     case SIAtomicScope::WORKGROUP:
1747     case SIAtomicScope::WAVEFRONT:
1748     case SIAtomicScope::SINGLETHREAD:
1749       // Do not generate "BUFFER_WBL2" as there are no caches it would
1750       // writeback, and would require an otherwise unnecessary
1751       // "S_WAITCNT vmcnt(0)".
1752       break;
1753     default:
1754       llvm_unreachable("Unsupported synchronization scope");
1755     }
1756   }
1757 
1758   if (Pos == Position::AFTER)
1759     --MI;
1760 
1761   // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" above, as
1762   // well as any other waits that are needed.
1763   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1764                         IsCrossAddrSpaceOrdering, Pos);
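  // Illustrative only (assumed assembly printing): an agent-scope release on
  // GFX940 therefore ends up as roughly
  //   buffer_wbl2 sc1
  //   s_waitcnt vmcnt(0)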
1765 
1766   return Changed;
1767 }
1768 
1769 bool SIGfx10CacheControl::enableLoadCacheBypass(
1770     const MachineBasicBlock::iterator &MI,
1771     SIAtomicScope Scope,
1772     SIAtomicAddrSpace AddrSpace) const {
1773   assert(MI->mayLoad() && !MI->mayStore());
1774   bool Changed = false;
1775 
1776   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1777     switch (Scope) {
1778     case SIAtomicScope::SYSTEM:
1779     case SIAtomicScope::AGENT:
1780       // Set the L0 and L1 cache policies to MISS_EVICT.
1781       // Note: there is no L2 cache coherent bypass control at the ISA level.
1782       Changed |= enableGLCBit(MI);
1783       Changed |= enableDLCBit(MI);
1784       break;
1785     case SIAtomicScope::WORKGROUP:
1786       // In WGP mode the waves of a work-group can be executing on either CU of
1787       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1788       // CU mode all waves of a work-group are on the same CU, and so the L0
1789       // does not need to be bypassed.
1790       if (!ST.isCuModeEnabled())
1791         Changed |= enableGLCBit(MI);
1792       break;
1793     case SIAtomicScope::WAVEFRONT:
1794     case SIAtomicScope::SINGLETHREAD:
1795       // No cache to bypass.
1796       break;
1797     default:
1798       llvm_unreachable("Unsupported synchronization scope");
1799     }
1800   }
1801 
1802   /// The scratch address space does not need the global memory caches
1803   /// to be bypassed as all memory operations by the same thread are
1804   /// sequentially consistent, and no other thread can access scratch
1805   /// memory.
1806 
1807   /// Other address spaces do not have a cache.
1808 
1809   return Changed;
1810 }
1811 
1812 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1813     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1814     bool IsVolatile, bool IsNonTemporal) const {
1815 
1816   // Only handle load and store, not atomic read-modify-write instructions. The
1817   // latter use glc to indicate if the atomic returns a result and so must not
1818   // be used for cache control.
1819   assert(MI->mayLoad() ^ MI->mayStore());
1820 
1821   // Only update load and store, not LLVM IR atomic read-modify-write
1822   // instructions. The latter are always marked as volatile, so they cannot
1823   // sensibly be handled here without pessimizing all atomics. They also do not
1824   // support the nontemporal attribute.
1825   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1826 
1827   bool Changed = false;
1828 
1829   if (IsVolatile) {
1830     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1831     // and MISS_LRU for store instructions.
1832     // Note: there is no L2 cache coherent bypass control at the ISA level.
1833     if (Op == SIMemOp::LOAD) {
1834       Changed |= enableGLCBit(MI);
1835       Changed |= enableDLCBit(MI);
1836     }
1837 
1838     // Ensure operation has completed at system scope to cause all volatile
1839     // operations to be visible outside the program in a global order. Do not
1840     // request cross address space as only the global address space can be
1841     // observable outside the program, so no need to cause a waitcnt for LDS
1842     // address space operations.
1843     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1844                           Position::AFTER);
1845     return Changed;
1846   }
1847 
1848   if (IsNonTemporal) {
1849     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1850     // and L2 cache policy to STREAM.
1851     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1852     // to MISS_EVICT and the L2 cache policy to STREAM.
1853     if (Op == SIMemOp::STORE)
1854       Changed |= enableGLCBit(MI);
1855     Changed |= enableSLCBit(MI);
1856 
1857     return Changed;
1858   }
1859 
1860   return Changed;
1861 }
1862 
1863 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1864                                      SIAtomicScope Scope,
1865                                      SIAtomicAddrSpace AddrSpace,
1866                                      SIMemOp Op,
1867                                      bool IsCrossAddrSpaceOrdering,
1868                                      Position Pos) const {
1869   bool Changed = false;
1870 
1871   MachineBasicBlock &MBB = *MI->getParent();
1872   DebugLoc DL = MI->getDebugLoc();
1873 
1874   if (Pos == Position::AFTER)
1875     ++MI;
1876 
1877   bool VMCnt = false;
1878   bool VSCnt = false;
1879   bool LGKMCnt = false;
1880 
1881   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1882       SIAtomicAddrSpace::NONE) {
1883     switch (Scope) {
1884     case SIAtomicScope::SYSTEM:
1885     case SIAtomicScope::AGENT:
1886       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1887         VMCnt |= true;
1888       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1889         VSCnt |= true;
1890       break;
1891     case SIAtomicScope::WORKGROUP:
1892       // In WGP mode the waves of a work-group can be executing on either CU of
1893       // the WGP. Therefore need to wait for operations to complete to ensure
1894       // they are visible to waves in the other CU as the L0 is per CU.
1895       // Otherwise in CU mode all waves of a work-group are on the same CU,
1896       // which shares the same L0.
1897       if (!ST.isCuModeEnabled()) {
1898         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1899           VMCnt |= true;
1900         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1901           VSCnt |= true;
1902       }
1903       break;
1904     case SIAtomicScope::WAVEFRONT:
1905     case SIAtomicScope::SINGLETHREAD:
1906       // The L0 cache keeps all memory operations in order for
1907       // work-items in the same wavefront.
1908       break;
1909     default:
1910       llvm_unreachable("Unsupported synchronization scope");
1911     }
1912   }
1913 
1914   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1915     switch (Scope) {
1916     case SIAtomicScope::SYSTEM:
1917     case SIAtomicScope::AGENT:
1918     case SIAtomicScope::WORKGROUP:
1919       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1920       // not needed as LDS operations for all waves are executed in a total
1921       // global ordering as observed by all waves. Required if also
1922       // synchronizing with global/GDS memory as LDS operations could be
1923       // reordered with respect to later global/GDS memory operations of the
1924       // same wave.
1925       LGKMCnt |= IsCrossAddrSpaceOrdering;
1926       break;
1927     case SIAtomicScope::WAVEFRONT:
1928     case SIAtomicScope::SINGLETHREAD:
1929       // The LDS keeps all memory operations in order for
1930       // the same wavefront.
1931       break;
1932     default:
1933       llvm_unreachable("Unsupported synchronization scope");
1934     }
1935   }
1936 
1937   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1938     switch (Scope) {
1939     case SIAtomicScope::SYSTEM:
1940     case SIAtomicScope::AGENT:
1941       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1942       // is not needed as GDS operations for all waves are executed in a total
1943       // global ordering as observed by all waves. Required if also
1944       // synchronizing with global/LDS memory as GDS operations could be
1945       // reordered with respect to later global/LDS memory operations of the
1946       // same wave.
1947       LGKMCnt |= IsCrossAddrSpaceOrdering;
1948       break;
1949     case SIAtomicScope::WORKGROUP:
1950     case SIAtomicScope::WAVEFRONT:
1951     case SIAtomicScope::SINGLETHREAD:
1952       // The GDS keeps all memory operations in order for
1953       // the same work-group.
1954       break;
1955     default:
1956       llvm_unreachable("Unsupported synchronization scope");
1957     }
1958   }
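  // Illustrative only: for an agent-scope acquire ordering global and LDS
  // memory with cross-address-space ordering, the code below would emit
  // roughly
  //   s_waitcnt vmcnt(0) lgkmcnt(0)
  //   s_waitcnt_vscnt null, 0x0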
1959 
1960   if (VMCnt || LGKMCnt) {
1961     unsigned WaitCntImmediate =
1962       AMDGPU::encodeWaitcnt(IV,
1963                             VMCnt ? 0 : getVmcntBitMask(IV),
1964                             getExpcntBitMask(IV),
1965                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1966     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1967     Changed = true;
1968   }
1969 
1970   if (VSCnt) {
1971     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1972       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1973       .addImm(0);
1974     Changed = true;
1975   }
1976 
1977   if (Pos == Position::AFTER)
1978     --MI;
1979 
1980   return Changed;
1981 }
1982 
1983 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1984                                         SIAtomicScope Scope,
1985                                         SIAtomicAddrSpace AddrSpace,
1986                                         Position Pos) const {
1987   if (!InsertCacheInv)
1988     return false;
1989 
1990   bool Changed = false;
1991 
1992   MachineBasicBlock &MBB = *MI->getParent();
1993   DebugLoc DL = MI->getDebugLoc();
1994 
1995   if (Pos == Position::AFTER)
1996     ++MI;
1997 
1998   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1999     switch (Scope) {
2000     case SIAtomicScope::SYSTEM:
2001     case SIAtomicScope::AGENT:
2002       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2003       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2004       Changed = true;
2005       break;
2006     case SIAtomicScope::WORKGROUP:
2007       // In WGP mode the waves of a work-group can be executing on either CU of
2008       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2009       // in CU mode all waves of a work-group are on the same CU, and so the
2010       // L0 does not need to be invalidated.
2011       if (!ST.isCuModeEnabled()) {
2012         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2013         Changed = true;
2014       }
2015       break;
2016     case SIAtomicScope::WAVEFRONT:
2017     case SIAtomicScope::SINGLETHREAD:
2018       // No cache to invalidate.
2019       break;
2020     default:
2021       llvm_unreachable("Unsupported synchronization scope");
2022     }
2023   }
2024 
2025   /// The scratch address space does not need the global memory cache
2026   /// to be flushed as all memory operations by the same thread are
2027   /// sequentially consistent, and no other thread can access scratch
2028   /// memory.
2029 
2030   /// Other address spaces do not have a cache.
2031 
2032   if (Pos == Position::AFTER)
2033     --MI;
2034 
2035   return Changed;
2036 }
2037 
2038 bool SIGfx11CacheControl::enableLoadCacheBypass(
2039     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2040     SIAtomicAddrSpace AddrSpace) const {
2041   assert(MI->mayLoad() && !MI->mayStore());
2042   bool Changed = false;
2043 
2044   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2045     switch (Scope) {
2046     case SIAtomicScope::SYSTEM:
2047     case SIAtomicScope::AGENT:
2048       // Set the L0 and L1 cache policies to MISS_EVICT.
2049       // Note: there is no L2 cache coherent bypass control at the ISA level.
2050       Changed |= enableGLCBit(MI);
2051       break;
2052     case SIAtomicScope::WORKGROUP:
2053       // In WGP mode the waves of a work-group can be executing on either CU of
2054       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2055       // CU mode all waves of a work-group are on the same CU, and so the L0
2056       // does not need to be bypassed.
2057       if (!ST.isCuModeEnabled())
2058         Changed |= enableGLCBit(MI);
2059       break;
2060     case SIAtomicScope::WAVEFRONT:
2061     case SIAtomicScope::SINGLETHREAD:
2062       // No cache to bypass.
2063       break;
2064     default:
2065       llvm_unreachable("Unsupported synchronization scope");
2066     }
2067   }
2068 
2069   /// The scratch address space does not need the global memory caches
2070   /// to be bypassed as all memory operations by the same thread are
2071   /// sequentially consistent, and no other thread can access scratch
2072   /// memory.
2073 
2074   /// Other address spaces do not have a cache.
2075 
2076   return Changed;
2077 }
2078 
2079 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2080     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2081     bool IsVolatile, bool IsNonTemporal) const {
2082 
2083   // Only handle load and store, not atomic read-modify-write instructions. The
2084   // latter use glc to indicate if the atomic returns a result and so must not
2085   // be used for cache control.
2086   assert(MI->mayLoad() ^ MI->mayStore());
2087 
2088   // Only update load and store, not LLVM IR atomic read-modify-write
2089   // instructions. The latter are always marked as volatile, so they cannot
2090   // sensibly be handled here without pessimizing all atomics. They also do not
2091   // support the nontemporal attribute.
2092   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2093 
2094   bool Changed = false;
2095 
2096   if (IsVolatile) {
2097     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2098     // and MISS_LRU for store instructions.
2099     // Note: there is no L2 cache coherent bypass control at the ISA level.
2100     if (Op == SIMemOp::LOAD)
2101       Changed |= enableGLCBit(MI);
2102 
2103     // Set MALL NOALLOC for load and store instructions.
2104     Changed |= enableDLCBit(MI);
2105 
2106     // Ensure operation has completed at system scope to cause all volatile
2107     // operations to be visible outside the program in a global order. Do not
2108     // request cross address space as only the global address space can be
2109     // observable outside the program, so no need to cause a waitcnt for LDS
2110     // address space operations.
2111     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2112                           Position::AFTER);
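    // Illustrative only (mnemonics and operands assumed): a volatile global
    // load on GFX11 would look roughly like
    //   global_load_b32 v0, v[2:3], off glc dlc
    //   s_waitcnt vmcnt(0)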
2113     return Changed;
2114   }
2115 
2116   if (IsNonTemporal) {
2117     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2118     // and L2 cache policy to STREAM.
2119     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2120     // to MISS_EVICT and the L2 cache policy to STREAM.
2121     if (Op == SIMemOp::STORE)
2122       Changed |= enableGLCBit(MI);
2123     Changed |= enableSLCBit(MI);
2124 
2125     // Set MALL NOALLOC for load and store instructions.
2126     Changed |= enableDLCBit(MI);
2127     return Changed;
2128   }
2129 
2130   return Changed;
2131 }
2132 
2133 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2134   if (AtomicPseudoMIs.empty())
2135     return false;
2136 
2137   for (auto &MI : AtomicPseudoMIs)
2138     MI->eraseFromParent();
2139 
2140   AtomicPseudoMIs.clear();
2141   return true;
2142 }
2143 
2144 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2145                                    MachineBasicBlock::iterator &MI) {
2146   assert(MI->mayLoad() && !MI->mayStore());
2147 
2148   bool Changed = false;
2149 
2150   if (MOI.isAtomic()) {
2151     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2152         MOI.getOrdering() == AtomicOrdering::Acquire ||
2153         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2154       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2155                                            MOI.getOrderingAddrSpace());
2156     }
2157 
2158     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2159       Changed |= CC->insertWait(MI, MOI.getScope(),
2160                                 MOI.getOrderingAddrSpace(),
2161                                 SIMemOp::LOAD | SIMemOp::STORE,
2162                                 MOI.getIsCrossAddressSpaceOrdering(),
2163                                 Position::BEFORE);
2164 
2165     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2166         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2167       Changed |= CC->insertWait(MI, MOI.getScope(),
2168                                 MOI.getInstrAddrSpace(),
2169                                 SIMemOp::LOAD,
2170                                 MOI.getIsCrossAddressSpaceOrdering(),
2171                                 Position::AFTER);
2172       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2173                                    MOI.getOrderingAddrSpace(),
2174                                    Position::AFTER);
2175     }
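    // Illustrative only (assumed target and operands): on GFX10, a
    //   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire
    // is expanded by the calls above to roughly
    //   global_load_dword v0, v[1:2], off glc dlc
    //   s_waitcnt vmcnt(0)
    //   buffer_gl0_inv
    //   buffer_gl1_inv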
2176 
2177     return Changed;
2178   }
2179 
2180   // Atomic instructions already bypass caches to the scope specified by the
2181   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2182   // need additional treatment.
2183   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2184                                                 SIMemOp::LOAD, MOI.isVolatile(),
2185                                                 MOI.isNonTemporal());
2186   return Changed;
2187 }
2188 
2189 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2190                                     MachineBasicBlock::iterator &MI) {
2191   assert(!MI->mayLoad() && MI->mayStore());
2192 
2193   bool Changed = false;
2194 
2195   if (MOI.isAtomic()) {
2196     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2197         MOI.getOrdering() == AtomicOrdering::Release ||
2198         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2199       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2200                                             MOI.getOrderingAddrSpace());
2201     }
2202 
2203     if (MOI.getOrdering() == AtomicOrdering::Release ||
2204         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2205       Changed |= CC->insertRelease(MI, MOI.getScope(),
2206                                    MOI.getOrderingAddrSpace(),
2207                                    MOI.getIsCrossAddressSpaceOrdering(),
2208                                    Position::BEFORE);
2209 
2210     return Changed;
2211   }
2212 
2213   // Atomic instructions already bypass caches to the scope specified by the
2214   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2215   // need additional treatment.
2216   Changed |= CC->enableVolatileAndOrNonTemporal(
2217       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2218       MOI.isNonTemporal());
2219   return Changed;
2220 }
2221 
2222 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2223                                           MachineBasicBlock::iterator &MI) {
2224   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2225 
2226   AtomicPseudoMIs.push_back(MI);
2227   bool Changed = false;
2228 
2229   if (MOI.isAtomic()) {
2230     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2231       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2232                                 SIMemOp::LOAD | SIMemOp::STORE,
2233                                 MOI.getIsCrossAddressSpaceOrdering(),
2234                                 Position::BEFORE);
2235 
2236     if (MOI.getOrdering() == AtomicOrdering::Release ||
2237         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2238         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2239       /// TODO: This relies on a barrier always generating a waitcnt
2240       /// for LDS to ensure it is not reordered with the completion of
2241       /// the preceding LDS operations. If the barrier had a memory
2242       /// ordering and memory scope, then the library would not need to
2243       /// generate a fence. Could add support in this file for
2244       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2245       /// adding S_WAITCNT before a S_BARRIER.
2246       Changed |= CC->insertRelease(MI, MOI.getScope(),
2247                                    MOI.getOrderingAddrSpace(),
2248                                    MOI.getIsCrossAddressSpaceOrdering(),
2249                                    Position::BEFORE);
2250 
2251     // TODO: If both release and invalidate are happening they could be combined
2252     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2253     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2254     // track cache invalidate and write back instructions.
2255 
2256     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2257         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2258         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2259       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2260                                    MOI.getOrderingAddrSpace(),
2261                                    Position::BEFORE);
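    // Illustrative only (assumed target): on GFX10 a
    //   fence syncscope("agent") acq_rel
    // typically becomes a wait such as "s_waitcnt vmcnt(0) lgkmcnt(0)" plus
    // "s_waitcnt_vscnt null, 0x0", followed by "buffer_gl0_inv" and
    // "buffer_gl1_inv"; the ATOMIC_FENCE pseudo itself is erased by
    // removeAtomicPseudoMIs().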
2262 
2263     return Changed;
2264   }
2265 
2266   return Changed;
2267 }
2268 
2269 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2270   MachineBasicBlock::iterator &MI) {
2271   assert(MI->mayLoad() && MI->mayStore());
2272 
2273   bool Changed = false;
2274 
2275   if (MOI.isAtomic()) {
2276     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2277         MOI.getOrdering() == AtomicOrdering::Acquire ||
2278         MOI.getOrdering() == AtomicOrdering::Release ||
2279         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2280         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2281       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2282                                           MOI.getInstrAddrSpace());
2283     }
2284 
2285     if (MOI.getOrdering() == AtomicOrdering::Release ||
2286         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2287         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2288         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2289       Changed |= CC->insertRelease(MI, MOI.getScope(),
2290                                    MOI.getOrderingAddrSpace(),
2291                                    MOI.getIsCrossAddressSpaceOrdering(),
2292                                    Position::BEFORE);
2293 
2294     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2295         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2296         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2297         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2298         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2299       Changed |= CC->insertWait(MI, MOI.getScope(),
2300                                 MOI.getInstrAddrSpace(),
2301                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2302                                                    SIMemOp::STORE,
2303                                 MOI.getIsCrossAddressSpaceOrdering(),
2304                                 Position::AFTER);
2305       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2306                                    MOI.getOrderingAddrSpace(),
2307                                    Position::AFTER);
2308     }
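    // Illustrative only (assumed target): for a returning atomicrmw at agent
    // scope with seq_cst ordering on GFX10, the calls above bracket the atomic
    // with roughly "s_waitcnt vmcnt(0) lgkmcnt(0)" and "s_waitcnt_vscnt null,
    // 0x0" before it, and "s_waitcnt vmcnt(0)" followed by "buffer_gl0_inv"
    // and "buffer_gl1_inv" after it.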
2309 
2310     return Changed;
2311   }
2312 
2313   return Changed;
2314 }
2315 
2316 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2317   bool Changed = false;
2318 
2319   SIMemOpAccess MOA(MF);
2320   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2321 
2322   for (auto &MBB : MF) {
2323     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2324 
2325       // Unbundle instructions after the post-RA scheduler.
2326       if (MI->isBundle() && MI->mayLoadOrStore()) {
2327         MachineBasicBlock::instr_iterator II(MI->getIterator());
2328         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2329              I != E && I->isBundledWithPred(); ++I) {
2330           I->unbundleFromPred();
2331           for (MachineOperand &MO : I->operands())
2332             if (MO.isReg())
2333               MO.setIsInternalRead(false);
2334         }
2335 
2336         MI->eraseFromParent();
2337         MI = II->getIterator();
2338       }
2339 
2340       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2341         continue;
2342 
2343       if (const auto &MOI = MOA.getLoadInfo(MI))
2344         Changed |= expandLoad(*MOI, MI);
2345       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2346         Changed |= expandStore(*MOI, MI);
2347         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2348       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2349         Changed |= expandAtomicFence(*MOI, MI);
2350       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2351         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2352     }
2353   }
2354 
2355   Changed |= removeAtomicPseudoMIs();
2356   return Changed;
2357 }
2358 
2359 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2360 
2361 char SIMemoryLegalizer::ID = 0;
2362 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2363 
2364 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2365   return new SIMemoryLegalizer();
2366 }
2367