//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
15
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"
32
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37 namespace {
38
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
55
56 /// The atomic synchronization scopes supported by the AMDGPU target.
/// The atomic synchronization scopes supported by the AMDGPU target.
/// Enumerators are ordered from narrowest to widest scope so that
/// std::min/std::max and relational comparisons can be used to clamp or
/// widen a scope (see SIMemOpInfo's constructor).
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
65
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operation. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87
88 class SIMemOpInfo final {
89 private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101
SIMemOpInfo(AtomicOrdering Ordering=AtomicOrdering::SequentiallyConsistent,SIAtomicScope Scope=SIAtomicScope::SYSTEM,SIAtomicAddrSpace OrderingAddrSpace=SIAtomicAddrSpace::ATOMIC,SIAtomicAddrSpace InstrAddrSpace=SIAtomicAddrSpace::ALL,bool IsCrossAddressSpaceOrdering=true,AtomicOrdering FailureOrdering=AtomicOrdering::SequentiallyConsistent,bool IsVolatile=false,bool IsNonTemporal=false)102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106 bool IsCrossAddressSpaceOrdering = true,
107 AtomicOrdering FailureOrdering =
108 AtomicOrdering::SequentiallyConsistent,
109 bool IsVolatile = false,
110 bool IsNonTemporal = false)
111 : Ordering(Ordering), FailureOrdering(FailureOrdering),
112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113 InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile),
116 IsNonTemporal(IsNonTemporal) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
154
155 public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
getScope() const158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
getOrdering() const164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
getFailureOrdering() const170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces be accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
getInstrAddrSpace() const176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
getOrderingAddrSpace() const182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns Return true iff memory ordering of operations on
187 /// different address spaces is required.
getIsCrossAddressSpaceOrdering() const188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
isVolatile() const194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
isNonTemporal() const200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
isAtomic() const206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210 };
211
212 class SIMemOpAccess final {
213 private:
214 AMDGPUMachineModuleInfo *MMI = nullptr;
215
216 /// Reports unsupported message \p Msg for \p MI to LLVM context.
217 void reportUnsupported(const MachineBasicBlock::iterator &MI,
218 const char *Msg) const;
219
220 /// Inspects the target synchronization scope \p SSID and determines
221 /// the SI atomic scope it corresponds to, the address spaces it
222 /// covers, and whether the memory ordering applies between address
223 /// spaces.
224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226
227 /// \return Return a bit set of the address spaces accessed by \p AS.
228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229
230 /// \returns Info constructed from \p MI, which has at least machine memory
231 /// operand.
232 std::optional<SIMemOpInfo>
233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234
235 public:
236 /// Construct class to support accessing the machine memory operands
237 /// of instructions in the machine function \p MF.
238 SIMemOpAccess(MachineFunction &MF);
239
240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241 std::optional<SIMemOpInfo>
242 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243
244 /// \returns Store info if \p MI is a store operation, "std::nullopt"
245 /// otherwise.
246 std::optional<SIMemOpInfo>
247 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248
249 /// \returns Atomic fence info if \p MI is an atomic fence operation,
250 /// "std::nullopt" otherwise.
251 std::optional<SIMemOpInfo>
252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253
254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255 /// rmw operation, "std::nullopt" otherwise.
256 std::optional<SIMemOpInfo>
257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259
260 class SICacheControl {
261 protected:
262
263 /// AMDGPU subtarget info.
264 const GCNSubtarget &ST;
265
266 /// Instruction info.
267 const SIInstrInfo *TII = nullptr;
268
269 IsaVersion IV;
270
271 /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv;
273
274 SICacheControl(const GCNSubtarget &ST);
275
276 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
277 /// \returns Returns true if \p MI is modified, false otherwise.
278 bool enableNamedBit(const MachineBasicBlock::iterator MI,
279 AMDGPU::CPol::CPol Bit) const;
280
281 public:
282
283 /// Create a cache control for the subtarget \p ST.
284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285
286 /// Update \p MI memory load instruction to bypass any caches up to
287 /// the \p Scope memory scope for address spaces \p
288 /// AddrSpace. Return true iff the instruction was modified.
289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290 SIAtomicScope Scope,
291 SIAtomicAddrSpace AddrSpace) const = 0;
292
293 /// Update \p MI memory store instruction to bypass any caches up to
294 /// the \p Scope memory scope for address spaces \p
295 /// AddrSpace. Return true iff the instruction was modified.
296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297 SIAtomicScope Scope,
298 SIAtomicAddrSpace AddrSpace) const = 0;
299
300 /// Update \p MI memory read-modify-write instruction to bypass any caches up
301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302 /// iff the instruction was modified.
303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304 SIAtomicScope Scope,
305 SIAtomicAddrSpace AddrSpace) const = 0;
306
307 /// Update \p MI memory instruction of kind \p Op associated with address
308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309 /// true iff the instruction was modified.
310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311 SIAtomicAddrSpace AddrSpace,
312 SIMemOp Op, bool IsVolatile,
313 bool IsNonTemporal) const = 0;
314
315 /// Inserts any necessary instructions at position \p Pos relative
316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317 /// \p Op associated with address spaces \p AddrSpace have completed. Used
318 /// between memory instructions to enforce the order they become visible as
319 /// observed by other memory instructions executing in memory scope \p Scope.
320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321 /// address spaces. Returns true iff any instructions inserted.
322 virtual bool insertWait(MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace,
325 SIMemOp Op,
326 bool IsCrossAddrSpaceOrdering,
327 Position Pos) const = 0;
328
329 /// Inserts any necessary instructions at position \p Pos relative to
330 /// instruction \p MI to ensure any subsequent memory instructions of this
331 /// thread with address spaces \p AddrSpace will observe the previous memory
332 /// operations by any thread for memory scopes up to memory scope \p Scope .
333 /// Returns true iff any instructions inserted.
334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335 SIAtomicScope Scope,
336 SIAtomicAddrSpace AddrSpace,
337 Position Pos) const = 0;
338
339 /// Inserts any necessary instructions at position \p Pos relative to
340 /// instruction \p MI to ensure previous memory instructions by this thread
341 /// with address spaces \p AddrSpace have completed and can be observed by
342 /// subsequent memory instructions by any thread executing in memory scope \p
343 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344 /// between address spaces. Returns true iff any instructions inserted.
345 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 bool IsCrossAddrSpaceOrdering,
349 Position Pos) const = 0;
350
351 /// Virtual destructor to allow derivations to be deleted.
352 virtual ~SICacheControl() = default;
353
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355 MachineBasicBlock::iterator &MI) const {
356 return false;
357 }
358 };
359
360 class SIGfx6CacheControl : public SICacheControl {
361 protected:
362
363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364 /// is modified, false otherwise.
enableGLCBit(const MachineBasicBlock::iterator & MI) const365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366 return enableNamedBit(MI, AMDGPU::CPol::GLC);
367 }
368
369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370 /// is modified, false otherwise.
enableSLCBit(const MachineBasicBlock::iterator & MI) const371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372 return enableNamedBit(MI, AMDGPU::CPol::SLC);
373 }
374
375 public:
376
SIGfx6CacheControl(const GCNSubtarget & ST)377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378
379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const override;
382
383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace) const override;
386
387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388 SIAtomicScope Scope,
389 SIAtomicAddrSpace AddrSpace) const override;
390
391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393 bool IsVolatile,
394 bool IsNonTemporal) const override;
395
396 bool insertWait(MachineBasicBlock::iterator &MI,
397 SIAtomicScope Scope,
398 SIAtomicAddrSpace AddrSpace,
399 SIMemOp Op,
400 bool IsCrossAddrSpaceOrdering,
401 Position Pos) const override;
402
403 bool insertAcquire(MachineBasicBlock::iterator &MI,
404 SIAtomicScope Scope,
405 SIAtomicAddrSpace AddrSpace,
406 Position Pos) const override;
407
408 bool insertRelease(MachineBasicBlock::iterator &MI,
409 SIAtomicScope Scope,
410 SIAtomicAddrSpace AddrSpace,
411 bool IsCrossAddrSpaceOrdering,
412 Position Pos) const override;
413 };
414
415 class SIGfx7CacheControl : public SIGfx6CacheControl {
416 public:
417
SIGfx7CacheControl(const GCNSubtarget & ST)418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419
420 bool insertAcquire(MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace,
423 Position Pos) const override;
424
425 };
426
427 class SIGfx90ACacheControl : public SIGfx7CacheControl {
428 public:
429
SIGfx90ACacheControl(const GCNSubtarget & ST)430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431
432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace) const override;
435
436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437 SIAtomicScope Scope,
438 SIAtomicAddrSpace AddrSpace) const override;
439
440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441 SIAtomicScope Scope,
442 SIAtomicAddrSpace AddrSpace) const override;
443
444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446 bool IsVolatile,
447 bool IsNonTemporal) const override;
448
449 bool insertWait(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 SIMemOp Op,
453 bool IsCrossAddrSpaceOrdering,
454 Position Pos) const override;
455
456 bool insertAcquire(MachineBasicBlock::iterator &MI,
457 SIAtomicScope Scope,
458 SIAtomicAddrSpace AddrSpace,
459 Position Pos) const override;
460
461 bool insertRelease(MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace,
464 bool IsCrossAddrSpaceOrdering,
465 Position Pos) const override;
466 };
467
468 class SIGfx940CacheControl : public SIGfx90ACacheControl {
469 protected:
470
471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472 /// is modified, false otherwise.
enableSC0Bit(const MachineBasicBlock::iterator & MI) const473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474 return enableNamedBit(MI, AMDGPU::CPol::SC0);
475 }
476
477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478 /// is modified, false otherwise.
enableSC1Bit(const MachineBasicBlock::iterator & MI) const479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480 return enableNamedBit(MI, AMDGPU::CPol::SC1);
481 }
482
483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484 /// is modified, false otherwise.
enableNTBit(const MachineBasicBlock::iterator & MI) const485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486 return enableNamedBit(MI, AMDGPU::CPol::NT);
487 }
488
489 public:
490
SIGfx940CacheControl(const GCNSubtarget & ST)491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
492
493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494 SIAtomicScope Scope,
495 SIAtomicAddrSpace AddrSpace) const override;
496
497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace) const override;
500
501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502 SIAtomicScope Scope,
503 SIAtomicAddrSpace AddrSpace) const override;
504
505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507 bool IsVolatile,
508 bool IsNonTemporal) const override;
509
510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512
513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515 Position Pos) const override;
516
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518 MachineBasicBlock::iterator &MI) const override {
519 bool Changed = false;
520 if (ST.hasForceStoreSC0SC1() &&
521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522 SIAtomicAddrSpace::GLOBAL |
523 SIAtomicAddrSpace::OTHER)) !=
524 SIAtomicAddrSpace::NONE) {
525 Changed |= enableSC0Bit(MI);
526 Changed |= enableSC1Bit(MI);
527 }
528 return Changed;
529 }
530 };
531
532 class SIGfx10CacheControl : public SIGfx7CacheControl {
533 protected:
534
535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536 /// is modified, false otherwise.
enableDLCBit(const MachineBasicBlock::iterator & MI) const537 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538 return enableNamedBit(MI, AMDGPU::CPol::DLC);
539 }
540
541 public:
542
SIGfx10CacheControl(const GCNSubtarget & ST)543 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544
545 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546 SIAtomicScope Scope,
547 SIAtomicAddrSpace AddrSpace) const override;
548
549 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551 bool IsVolatile,
552 bool IsNonTemporal) const override;
553
554 bool insertWait(MachineBasicBlock::iterator &MI,
555 SIAtomicScope Scope,
556 SIAtomicAddrSpace AddrSpace,
557 SIMemOp Op,
558 bool IsCrossAddrSpaceOrdering,
559 Position Pos) const override;
560
561 bool insertAcquire(MachineBasicBlock::iterator &MI,
562 SIAtomicScope Scope,
563 SIAtomicAddrSpace AddrSpace,
564 Position Pos) const override;
565 };
566
567 class SIGfx11CacheControl : public SIGfx10CacheControl {
568 public:
SIGfx11CacheControl(const GCNSubtarget & ST)569 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570
571 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572 SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace) const override;
574
575 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577 bool IsVolatile,
578 bool IsNonTemporal) const override;
579 };
580
581 class SIGfx12CacheControl : public SIGfx11CacheControl {
582 protected:
583 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
584 // \returns Returns true if \p MI is modified, false otherwise.
585 bool setTH(const MachineBasicBlock::iterator MI,
586 AMDGPU::CPol::CPol Value) const;
587 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
588 // MI. \returns Returns true if \p MI is modified, false otherwise.
589 bool setScope(const MachineBasicBlock::iterator MI,
590 AMDGPU::CPol::CPol Value) const;
591
592 public:
SIGfx12CacheControl(const GCNSubtarget & ST)593 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
594
595 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
596 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
597 bool IsCrossAddrSpaceOrdering, Position Pos) const override;
598
599 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
600 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
601
602 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
603 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
604 bool IsVolatile,
605 bool IsNonTemporal) const override;
606 };
607
608 class SIMemoryLegalizer final : public MachineFunctionPass {
609 private:
610
611 /// Cache Control.
612 std::unique_ptr<SICacheControl> CC = nullptr;
613
614 /// List of atomic pseudo instructions.
615 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
616
617 /// Return true iff instruction \p MI is a atomic instruction that
618 /// returns a result.
isAtomicRet(const MachineInstr & MI) const619 bool isAtomicRet(const MachineInstr &MI) const {
620 return SIInstrInfo::isAtomicRet(MI);
621 }
622
623 /// Removes all processed atomic pseudo instructions from the current
624 /// function. Returns true if current function is modified, false otherwise.
625 bool removeAtomicPseudoMIs();
626
627 /// Expands load operation \p MI. Returns true if instructions are
628 /// added/deleted or \p MI is modified, false otherwise.
629 bool expandLoad(const SIMemOpInfo &MOI,
630 MachineBasicBlock::iterator &MI);
631 /// Expands store operation \p MI. Returns true if instructions are
632 /// added/deleted or \p MI is modified, false otherwise.
633 bool expandStore(const SIMemOpInfo &MOI,
634 MachineBasicBlock::iterator &MI);
635 /// Expands atomic fence operation \p MI. Returns true if
636 /// instructions are added/deleted or \p MI is modified, false otherwise.
637 bool expandAtomicFence(const SIMemOpInfo &MOI,
638 MachineBasicBlock::iterator &MI);
639 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
640 /// instructions are added/deleted or \p MI is modified, false otherwise.
641 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
642 MachineBasicBlock::iterator &MI);
643
644 public:
645 static char ID;
646
SIMemoryLegalizer()647 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
648
getAnalysisUsage(AnalysisUsage & AU) const649 void getAnalysisUsage(AnalysisUsage &AU) const override {
650 AU.setPreservesCFG();
651 MachineFunctionPass::getAnalysisUsage(AU);
652 }
653
getPassName() const654 StringRef getPassName() const override {
655 return PASS_NAME;
656 }
657
658 bool runOnMachineFunction(MachineFunction &MF) override;
659 };
660
661 } // end namespace anonymous
662
reportUnsupported(const MachineBasicBlock::iterator & MI,const char * Msg) const663 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
664 const char *Msg) const {
665 const Function &Func = MI->getParent()->getParent()->getFunction();
666 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
667 Func.getContext().diagnose(Diag);
668 }
669
670 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
toSIAtomicScope(SyncScope::ID SSID,SIAtomicAddrSpace InstrAddrSpace) const671 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
672 SIAtomicAddrSpace InstrAddrSpace) const {
673 if (SSID == SyncScope::System)
674 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
675 if (SSID == MMI->getAgentSSID())
676 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
677 if (SSID == MMI->getWorkgroupSSID())
678 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
679 true);
680 if (SSID == MMI->getWavefrontSSID())
681 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
682 true);
683 if (SSID == SyncScope::SingleThread)
684 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
685 true);
686 if (SSID == MMI->getSystemOneAddressSpaceSSID())
687 return std::tuple(SIAtomicScope::SYSTEM,
688 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
689 if (SSID == MMI->getAgentOneAddressSpaceSSID())
690 return std::tuple(SIAtomicScope::AGENT,
691 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
692 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
693 return std::tuple(SIAtomicScope::WORKGROUP,
694 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
695 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
696 return std::tuple(SIAtomicScope::WAVEFRONT,
697 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
699 return std::tuple(SIAtomicScope::SINGLETHREAD,
700 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701 return std::nullopt;
702 }
703
toSIAtomicAddrSpace(unsigned AS) const704 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
705 if (AS == AMDGPUAS::FLAT_ADDRESS)
706 return SIAtomicAddrSpace::FLAT;
707 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
708 return SIAtomicAddrSpace::GLOBAL;
709 if (AS == AMDGPUAS::LOCAL_ADDRESS)
710 return SIAtomicAddrSpace::LDS;
711 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
712 return SIAtomicAddrSpace::SCRATCH;
713 if (AS == AMDGPUAS::REGION_ADDRESS)
714 return SIAtomicAddrSpace::GDS;
715
716 return SIAtomicAddrSpace::OTHER;
717 }
718
SIMemOpAccess(MachineFunction & MF)719 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
720 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
721 }
722
constructFromMIWithMMO(const MachineBasicBlock::iterator & MI) const723 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
724 const MachineBasicBlock::iterator &MI) const {
725 assert(MI->getNumMemOperands() > 0);
726
727 SyncScope::ID SSID = SyncScope::SingleThread;
728 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
729 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
730 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
731 bool IsNonTemporal = true;
732 bool IsVolatile = false;
733
734 // Validator should check whether or not MMOs cover the entire set of
735 // locations accessed by the memory instruction.
736 for (const auto &MMO : MI->memoperands()) {
737 IsNonTemporal &= MMO->isNonTemporal();
738 IsVolatile |= MMO->isVolatile();
739 InstrAddrSpace |=
740 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
741 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
742 if (OpOrdering != AtomicOrdering::NotAtomic) {
743 const auto &IsSyncScopeInclusion =
744 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
745 if (!IsSyncScopeInclusion) {
746 reportUnsupported(MI,
747 "Unsupported non-inclusive atomic synchronization scope");
748 return std::nullopt;
749 }
750
751 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
752 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
753 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
754 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
755 FailureOrdering =
756 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
757 }
758 }
759
760 SIAtomicScope Scope = SIAtomicScope::NONE;
761 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
762 bool IsCrossAddressSpaceOrdering = false;
763 if (Ordering != AtomicOrdering::NotAtomic) {
764 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
765 if (!ScopeOrNone) {
766 reportUnsupported(MI, "Unsupported atomic synchronization scope");
767 return std::nullopt;
768 }
769 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
770 *ScopeOrNone;
771 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
772 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
773 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
774 reportUnsupported(MI, "Unsupported atomic address space");
775 return std::nullopt;
776 }
777 }
778 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
779 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
780 IsNonTemporal);
781 }
782
783 std::optional<SIMemOpInfo>
getLoadInfo(const MachineBasicBlock::iterator & MI) const784 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
785 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
786
787 if (!(MI->mayLoad() && !MI->mayStore()))
788 return std::nullopt;
789
790 // Be conservative if there are no memory operands.
791 if (MI->getNumMemOperands() == 0)
792 return SIMemOpInfo();
793
794 return constructFromMIWithMMO(MI);
795 }
796
797 std::optional<SIMemOpInfo>
getStoreInfo(const MachineBasicBlock::iterator & MI) const798 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
799 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
800
801 if (!(!MI->mayLoad() && MI->mayStore()))
802 return std::nullopt;
803
804 // Be conservative if there are no memory operands.
805 if (MI->getNumMemOperands() == 0)
806 return SIMemOpInfo();
807
808 return constructFromMIWithMMO(MI);
809 }
810
811 std::optional<SIMemOpInfo>
getAtomicFenceInfo(const MachineBasicBlock::iterator & MI) const812 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
813 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
814
815 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
816 return std::nullopt;
817
818 AtomicOrdering Ordering =
819 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
820
821 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
822 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
823 if (!ScopeOrNone) {
824 reportUnsupported(MI, "Unsupported atomic synchronization scope");
825 return std::nullopt;
826 }
827
828 SIAtomicScope Scope = SIAtomicScope::NONE;
829 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
830 bool IsCrossAddressSpaceOrdering = false;
831 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
832 *ScopeOrNone;
833
834 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
835 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
836 reportUnsupported(MI, "Unsupported atomic address space");
837 return std::nullopt;
838 }
839
840 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
841 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
842 }
843
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator & MI) const844 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
845 const MachineBasicBlock::iterator &MI) const {
846 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
847
848 if (!(MI->mayLoad() && MI->mayStore()))
849 return std::nullopt;
850
851 // Be conservative if there are no memory operands.
852 if (MI->getNumMemOperands() == 0)
853 return SIMemOpInfo();
854
855 return constructFromMIWithMMO(MI);
856 }
857
SICacheControl(const GCNSubtarget & ST)858 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
859 TII = ST.getInstrInfo();
860 IV = getIsaVersion(ST.getCPU());
861 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
862 }
863
enableNamedBit(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Bit) const864 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
865 AMDGPU::CPol::CPol Bit) const {
866 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
867 if (!CPol)
868 return false;
869
870 CPol->setImm(CPol->getImm() | Bit);
871 return true;
872 }
873
874 /* static */
create(const GCNSubtarget & ST)875 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
876 GCNSubtarget::Generation Generation = ST.getGeneration();
877 if (ST.hasGFX940Insts())
878 return std::make_unique<SIGfx940CacheControl>(ST);
879 if (ST.hasGFX90AInsts())
880 return std::make_unique<SIGfx90ACacheControl>(ST);
881 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
882 return std::make_unique<SIGfx6CacheControl>(ST);
883 if (Generation < AMDGPUSubtarget::GFX10)
884 return std::make_unique<SIGfx7CacheControl>(ST);
885 if (Generation < AMDGPUSubtarget::GFX11)
886 return std::make_unique<SIGfx10CacheControl>(ST);
887 if (Generation < AMDGPUSubtarget::GFX12)
888 return std::make_unique<SIGfx11CacheControl>(ST);
889 return std::make_unique<SIGfx12CacheControl>(ST);
890 }
891
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const892 bool SIGfx6CacheControl::enableLoadCacheBypass(
893 const MachineBasicBlock::iterator &MI,
894 SIAtomicScope Scope,
895 SIAtomicAddrSpace AddrSpace) const {
896 assert(MI->mayLoad() && !MI->mayStore());
897 bool Changed = false;
898
899 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
900 switch (Scope) {
901 case SIAtomicScope::SYSTEM:
902 case SIAtomicScope::AGENT:
903 // Set L1 cache policy to MISS_EVICT.
904 // Note: there is no L2 cache bypass policy at the ISA level.
905 Changed |= enableGLCBit(MI);
906 break;
907 case SIAtomicScope::WORKGROUP:
908 case SIAtomicScope::WAVEFRONT:
909 case SIAtomicScope::SINGLETHREAD:
910 // No cache to bypass.
911 break;
912 default:
913 llvm_unreachable("Unsupported synchronization scope");
914 }
915 }
916
917 /// The scratch address space does not need the global memory caches
918 /// to be bypassed as all memory operations by the same thread are
919 /// sequentially consistent, and no other thread can access scratch
920 /// memory.
921
922 /// Other address spaces do not have a cache.
923
924 return Changed;
925 }
926
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const927 bool SIGfx6CacheControl::enableStoreCacheBypass(
928 const MachineBasicBlock::iterator &MI,
929 SIAtomicScope Scope,
930 SIAtomicAddrSpace AddrSpace) const {
931 assert(!MI->mayLoad() && MI->mayStore());
932 bool Changed = false;
933
934 /// The L1 cache is write through so does not need to be bypassed. There is no
935 /// bypass control for the L2 cache at the isa level.
936
937 return Changed;
938 }
939
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const940 bool SIGfx6CacheControl::enableRMWCacheBypass(
941 const MachineBasicBlock::iterator &MI,
942 SIAtomicScope Scope,
943 SIAtomicAddrSpace AddrSpace) const {
944 assert(MI->mayLoad() && MI->mayStore());
945 bool Changed = false;
946
947 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
948 /// bypassed, and the GLC bit is instead used to indicate if they are
949 /// return or no-return.
950 /// Note: there is no L2 cache coherent bypass control at the ISA level.
951
952 return Changed;
953 }
954
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const955 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
956 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
957 bool IsVolatile, bool IsNonTemporal) const {
958 // Only handle load and store, not atomic read-modify-write insructions. The
959 // latter use glc to indicate if the atomic returns a result and so must not
960 // be used for cache control.
961 assert(MI->mayLoad() ^ MI->mayStore());
962
963 // Only update load and store, not LLVM IR atomic read-modify-write
964 // instructions. The latter are always marked as volatile so cannot sensibly
965 // handle it as do not want to pessimize all atomics. Also they do not support
966 // the nontemporal attribute.
967 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
968
969 bool Changed = false;
970
971 if (IsVolatile) {
972 // Set L1 cache policy to be MISS_EVICT for load instructions
973 // and MISS_LRU for store instructions.
974 // Note: there is no L2 cache bypass policy at the ISA level.
975 if (Op == SIMemOp::LOAD)
976 Changed |= enableGLCBit(MI);
977
978 // Ensure operation has completed at system scope to cause all volatile
979 // operations to be visible outside the program in a global order. Do not
980 // request cross address space as only the global address space can be
981 // observable outside the program, so no need to cause a waitcnt for LDS
982 // address space operations.
983 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
984 Position::AFTER);
985
986 return Changed;
987 }
988
989 if (IsNonTemporal) {
990 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
991 // for both loads and stores, and the L2 cache policy to STREAM.
992 Changed |= enableGLCBit(MI);
993 Changed |= enableSLCBit(MI);
994 return Changed;
995 }
996
997 return Changed;
998 }
999
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1000 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1001 SIAtomicScope Scope,
1002 SIAtomicAddrSpace AddrSpace,
1003 SIMemOp Op,
1004 bool IsCrossAddrSpaceOrdering,
1005 Position Pos) const {
1006 bool Changed = false;
1007
1008 MachineBasicBlock &MBB = *MI->getParent();
1009 DebugLoc DL = MI->getDebugLoc();
1010
1011 if (Pos == Position::AFTER)
1012 ++MI;
1013
1014 bool VMCnt = false;
1015 bool LGKMCnt = false;
1016
1017 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1018 SIAtomicAddrSpace::NONE) {
1019 switch (Scope) {
1020 case SIAtomicScope::SYSTEM:
1021 case SIAtomicScope::AGENT:
1022 VMCnt |= true;
1023 break;
1024 case SIAtomicScope::WORKGROUP:
1025 case SIAtomicScope::WAVEFRONT:
1026 case SIAtomicScope::SINGLETHREAD:
1027 // The L1 cache keeps all memory operations in order for
1028 // wavefronts in the same work-group.
1029 break;
1030 default:
1031 llvm_unreachable("Unsupported synchronization scope");
1032 }
1033 }
1034
1035 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1036 switch (Scope) {
1037 case SIAtomicScope::SYSTEM:
1038 case SIAtomicScope::AGENT:
1039 case SIAtomicScope::WORKGROUP:
1040 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1041 // not needed as LDS operations for all waves are executed in a total
1042 // global ordering as observed by all waves. Required if also
1043 // synchronizing with global/GDS memory as LDS operations could be
1044 // reordered with respect to later global/GDS memory operations of the
1045 // same wave.
1046 LGKMCnt |= IsCrossAddrSpaceOrdering;
1047 break;
1048 case SIAtomicScope::WAVEFRONT:
1049 case SIAtomicScope::SINGLETHREAD:
1050 // The LDS keeps all memory operations in order for
1051 // the same wavefront.
1052 break;
1053 default:
1054 llvm_unreachable("Unsupported synchronization scope");
1055 }
1056 }
1057
1058 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1059 switch (Scope) {
1060 case SIAtomicScope::SYSTEM:
1061 case SIAtomicScope::AGENT:
1062 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1063 // is not needed as GDS operations for all waves are executed in a total
1064 // global ordering as observed by all waves. Required if also
1065 // synchronizing with global/LDS memory as GDS operations could be
1066 // reordered with respect to later global/LDS memory operations of the
1067 // same wave.
1068 LGKMCnt |= IsCrossAddrSpaceOrdering;
1069 break;
1070 case SIAtomicScope::WORKGROUP:
1071 case SIAtomicScope::WAVEFRONT:
1072 case SIAtomicScope::SINGLETHREAD:
1073 // The GDS keeps all memory operations in order for
1074 // the same work-group.
1075 break;
1076 default:
1077 llvm_unreachable("Unsupported synchronization scope");
1078 }
1079 }
1080
1081 if (VMCnt || LGKMCnt) {
1082 unsigned WaitCntImmediate =
1083 AMDGPU::encodeWaitcnt(IV,
1084 VMCnt ? 0 : getVmcntBitMask(IV),
1085 getExpcntBitMask(IV),
1086 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1088 .addImm(WaitCntImmediate);
1089 Changed = true;
1090 }
1091
1092 if (Pos == Position::AFTER)
1093 --MI;
1094
1095 return Changed;
1096 }
1097
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1098 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1099 SIAtomicScope Scope,
1100 SIAtomicAddrSpace AddrSpace,
1101 Position Pos) const {
1102 if (!InsertCacheInv)
1103 return false;
1104
1105 bool Changed = false;
1106
1107 MachineBasicBlock &MBB = *MI->getParent();
1108 DebugLoc DL = MI->getDebugLoc();
1109
1110 if (Pos == Position::AFTER)
1111 ++MI;
1112
1113 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1114 switch (Scope) {
1115 case SIAtomicScope::SYSTEM:
1116 case SIAtomicScope::AGENT:
1117 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1118 Changed = true;
1119 break;
1120 case SIAtomicScope::WORKGROUP:
1121 case SIAtomicScope::WAVEFRONT:
1122 case SIAtomicScope::SINGLETHREAD:
1123 // No cache to invalidate.
1124 break;
1125 default:
1126 llvm_unreachable("Unsupported synchronization scope");
1127 }
1128 }
1129
1130 /// The scratch address space does not need the global memory cache
1131 /// to be flushed as all memory operations by the same thread are
1132 /// sequentially consistent, and no other thread can access scratch
1133 /// memory.
1134
1135 /// Other address spaces do not have a cache.
1136
1137 if (Pos == Position::AFTER)
1138 --MI;
1139
1140 return Changed;
1141 }
1142
insertRelease(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,bool IsCrossAddrSpaceOrdering,Position Pos) const1143 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1144 SIAtomicScope Scope,
1145 SIAtomicAddrSpace AddrSpace,
1146 bool IsCrossAddrSpaceOrdering,
1147 Position Pos) const {
1148 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1149 IsCrossAddrSpaceOrdering, Pos);
1150 }
1151
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1152 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1153 SIAtomicScope Scope,
1154 SIAtomicAddrSpace AddrSpace,
1155 Position Pos) const {
1156 if (!InsertCacheInv)
1157 return false;
1158
1159 bool Changed = false;
1160
1161 MachineBasicBlock &MBB = *MI->getParent();
1162 DebugLoc DL = MI->getDebugLoc();
1163
1164 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1165
1166 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1167 ? AMDGPU::BUFFER_WBINVL1
1168 : AMDGPU::BUFFER_WBINVL1_VOL;
1169
1170 if (Pos == Position::AFTER)
1171 ++MI;
1172
1173 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174 switch (Scope) {
1175 case SIAtomicScope::SYSTEM:
1176 case SIAtomicScope::AGENT:
1177 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1178 Changed = true;
1179 break;
1180 case SIAtomicScope::WORKGROUP:
1181 case SIAtomicScope::WAVEFRONT:
1182 case SIAtomicScope::SINGLETHREAD:
1183 // No cache to invalidate.
1184 break;
1185 default:
1186 llvm_unreachable("Unsupported synchronization scope");
1187 }
1188 }
1189
1190 /// The scratch address space does not need the global memory cache
1191 /// to be flushed as all memory operations by the same thread are
1192 /// sequentially consistent, and no other thread can access scratch
1193 /// memory.
1194
1195 /// Other address spaces do not have a cache.
1196
1197 if (Pos == Position::AFTER)
1198 --MI;
1199
1200 return Changed;
1201 }
1202
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1203 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1204 const MachineBasicBlock::iterator &MI,
1205 SIAtomicScope Scope,
1206 SIAtomicAddrSpace AddrSpace) const {
1207 assert(MI->mayLoad() && !MI->mayStore());
1208 bool Changed = false;
1209
1210 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1211 switch (Scope) {
1212 case SIAtomicScope::SYSTEM:
1213 case SIAtomicScope::AGENT:
1214 // Set the L1 cache policy to MISS_LRU.
1215 // Note: there is no L2 cache bypass policy at the ISA level.
1216 Changed |= enableGLCBit(MI);
1217 break;
1218 case SIAtomicScope::WORKGROUP:
1219 // In threadgroup split mode the waves of a work-group can be executing on
1220 // different CUs. Therefore need to bypass the L1 which is per CU.
1221 // Otherwise in non-threadgroup split mode all waves of a work-group are
1222 // on the same CU, and so the L1 does not need to be bypassed.
1223 if (ST.isTgSplitEnabled())
1224 Changed |= enableGLCBit(MI);
1225 break;
1226 case SIAtomicScope::WAVEFRONT:
1227 case SIAtomicScope::SINGLETHREAD:
1228 // No cache to bypass.
1229 break;
1230 default:
1231 llvm_unreachable("Unsupported synchronization scope");
1232 }
1233 }
1234
1235 /// The scratch address space does not need the global memory caches
1236 /// to be bypassed as all memory operations by the same thread are
1237 /// sequentially consistent, and no other thread can access scratch
1238 /// memory.
1239
1240 /// Other address spaces do not have a cache.
1241
1242 return Changed;
1243 }
1244
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1245 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1246 const MachineBasicBlock::iterator &MI,
1247 SIAtomicScope Scope,
1248 SIAtomicAddrSpace AddrSpace) const {
1249 assert(!MI->mayLoad() && MI->mayStore());
1250 bool Changed = false;
1251
1252 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1253 switch (Scope) {
1254 case SIAtomicScope::SYSTEM:
1255 case SIAtomicScope::AGENT:
1256 /// Do not set glc for store atomic operations as they implicitly write
1257 /// through the L1 cache.
1258 break;
1259 case SIAtomicScope::WORKGROUP:
1260 case SIAtomicScope::WAVEFRONT:
1261 case SIAtomicScope::SINGLETHREAD:
1262 // No cache to bypass. Store atomics implicitly write through the L1
1263 // cache.
1264 break;
1265 default:
1266 llvm_unreachable("Unsupported synchronization scope");
1267 }
1268 }
1269
1270 /// The scratch address space does not need the global memory caches
1271 /// to be bypassed as all memory operations by the same thread are
1272 /// sequentially consistent, and no other thread can access scratch
1273 /// memory.
1274
1275 /// Other address spaces do not have a cache.
1276
1277 return Changed;
1278 }
1279
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1280 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1281 const MachineBasicBlock::iterator &MI,
1282 SIAtomicScope Scope,
1283 SIAtomicAddrSpace AddrSpace) const {
1284 assert(MI->mayLoad() && MI->mayStore());
1285 bool Changed = false;
1286
1287 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1288 switch (Scope) {
1289 case SIAtomicScope::SYSTEM:
1290 case SIAtomicScope::AGENT:
1291 /// Do not set glc for RMW atomic operations as they implicitly bypass
1292 /// the L1 cache, and the glc bit is instead used to indicate if they are
1293 /// return or no-return.
1294 break;
1295 case SIAtomicScope::WORKGROUP:
1296 case SIAtomicScope::WAVEFRONT:
1297 case SIAtomicScope::SINGLETHREAD:
1298 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1299 break;
1300 default:
1301 llvm_unreachable("Unsupported synchronization scope");
1302 }
1303 }
1304
1305 return Changed;
1306 }
1307
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const1308 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1309 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1310 bool IsVolatile, bool IsNonTemporal) const {
1311 // Only handle load and store, not atomic read-modify-write insructions. The
1312 // latter use glc to indicate if the atomic returns a result and so must not
1313 // be used for cache control.
1314 assert(MI->mayLoad() ^ MI->mayStore());
1315
1316 // Only update load and store, not LLVM IR atomic read-modify-write
1317 // instructions. The latter are always marked as volatile so cannot sensibly
1318 // handle it as do not want to pessimize all atomics. Also they do not support
1319 // the nontemporal attribute.
1320 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1321
1322 bool Changed = false;
1323
1324 if (IsVolatile) {
1325 // Set L1 cache policy to be MISS_EVICT for load instructions
1326 // and MISS_LRU for store instructions.
1327 // Note: there is no L2 cache bypass policy at the ISA level.
1328 if (Op == SIMemOp::LOAD)
1329 Changed |= enableGLCBit(MI);
1330
1331 // Ensure operation has completed at system scope to cause all volatile
1332 // operations to be visible outside the program in a global order. Do not
1333 // request cross address space as only the global address space can be
1334 // observable outside the program, so no need to cause a waitcnt for LDS
1335 // address space operations.
1336 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1337 Position::AFTER);
1338
1339 return Changed;
1340 }
1341
1342 if (IsNonTemporal) {
1343 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1344 // for both loads and stores, and the L2 cache policy to STREAM.
1345 Changed |= enableGLCBit(MI);
1346 Changed |= enableSLCBit(MI);
1347 return Changed;
1348 }
1349
1350 return Changed;
1351 }
1352
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1353 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1354 SIAtomicScope Scope,
1355 SIAtomicAddrSpace AddrSpace,
1356 SIMemOp Op,
1357 bool IsCrossAddrSpaceOrdering,
1358 Position Pos) const {
1359 if (ST.isTgSplitEnabled()) {
1360 // In threadgroup split mode the waves of a work-group can be executing on
1361 // different CUs. Therefore need to wait for global or GDS memory operations
1362 // to complete to ensure they are visible to waves in the other CUs.
1363 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1364 // the same CU, so no need to wait for global memory as all waves in the
1365 // work-group access the same the L1, nor wait for GDS as access are ordered
1366 // on a CU.
1367 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1368 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1369 (Scope == SIAtomicScope::WORKGROUP)) {
1370 // Same as GFX7 using agent scope.
1371 Scope = SIAtomicScope::AGENT;
1372 }
1373 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1374 // LDS memory operations.
1375 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1376 }
1377 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1378 IsCrossAddrSpaceOrdering, Pos);
1379 }
1380
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1381 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1382 SIAtomicScope Scope,
1383 SIAtomicAddrSpace AddrSpace,
1384 Position Pos) const {
1385 if (!InsertCacheInv)
1386 return false;
1387
1388 bool Changed = false;
1389
1390 MachineBasicBlock &MBB = *MI->getParent();
1391 DebugLoc DL = MI->getDebugLoc();
1392
1393 if (Pos == Position::AFTER)
1394 ++MI;
1395
1396 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1397 switch (Scope) {
1398 case SIAtomicScope::SYSTEM:
1399 // Ensures that following loads will not see stale remote VMEM data or
1400 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1401 // CC will never be stale due to the local memory probes.
1402 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1403 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1404 // hardware does not reorder memory operations by the same wave with
1405 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1406 // remove any cache lines of earlier writes by the same wave and ensures
1407 // later reads by the same wave will refetch the cache lines.
1408 Changed = true;
1409 break;
1410 case SIAtomicScope::AGENT:
1411 // Same as GFX7.
1412 break;
1413 case SIAtomicScope::WORKGROUP:
1414 // In threadgroup split mode the waves of a work-group can be executing on
1415 // different CUs. Therefore need to invalidate the L1 which is per CU.
1416 // Otherwise in non-threadgroup split mode all waves of a work-group are
1417 // on the same CU, and so the L1 does not need to be invalidated.
1418 if (ST.isTgSplitEnabled()) {
1419 // Same as GFX7 using agent scope.
1420 Scope = SIAtomicScope::AGENT;
1421 }
1422 break;
1423 case SIAtomicScope::WAVEFRONT:
1424 case SIAtomicScope::SINGLETHREAD:
1425 // Same as GFX7.
1426 break;
1427 default:
1428 llvm_unreachable("Unsupported synchronization scope");
1429 }
1430 }
1431
1432 /// The scratch address space does not need the global memory cache
1433 /// to be flushed as all memory operations by the same thread are
1434 /// sequentially consistent, and no other thread can access scratch
1435 /// memory.
1436
1437 /// Other address spaces do not have a cache.
1438
1439 if (Pos == Position::AFTER)
1440 --MI;
1441
1442 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1443
1444 return Changed;
1445 }
1446
insertRelease(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,bool IsCrossAddrSpaceOrdering,Position Pos) const1447 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1448 SIAtomicScope Scope,
1449 SIAtomicAddrSpace AddrSpace,
1450 bool IsCrossAddrSpaceOrdering,
1451 Position Pos) const {
1452 bool Changed = false;
1453
1454 MachineBasicBlock &MBB = *MI->getParent();
1455 const DebugLoc &DL = MI->getDebugLoc();
1456
1457 if (Pos == Position::AFTER)
1458 ++MI;
1459
1460 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1461 switch (Scope) {
1462 case SIAtomicScope::SYSTEM:
1463 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1464 // hardware does not reorder memory operations by the same wave with
1465 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1466 // to initiate writeback of any dirty cache lines of earlier writes by the
1467 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1468 // writeback has completed.
1469 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1470 // Set SC bits to indicate system scope.
1471 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1472 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1473 // vmcnt(0)" needed by the "BUFFER_WBL2".
1474 Changed = true;
1475 break;
1476 case SIAtomicScope::AGENT:
1477 case SIAtomicScope::WORKGROUP:
1478 case SIAtomicScope::WAVEFRONT:
1479 case SIAtomicScope::SINGLETHREAD:
1480 // Same as GFX7.
1481 break;
1482 default:
1483 llvm_unreachable("Unsupported synchronization scope");
1484 }
1485 }
1486
1487 if (Pos == Position::AFTER)
1488 --MI;
1489
1490 Changed |=
1491 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1492 IsCrossAddrSpaceOrdering, Pos);
1493
1494 return Changed;
1495 }
1496
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1497 bool SIGfx940CacheControl::enableLoadCacheBypass(
1498 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1499 SIAtomicAddrSpace AddrSpace) const {
1500 assert(MI->mayLoad() && !MI->mayStore());
1501 bool Changed = false;
1502
1503 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1504 switch (Scope) {
1505 case SIAtomicScope::SYSTEM:
1506 // Set SC bits to indicate system scope.
1507 Changed |= enableSC0Bit(MI);
1508 Changed |= enableSC1Bit(MI);
1509 break;
1510 case SIAtomicScope::AGENT:
1511 // Set SC bits to indicate agent scope.
1512 Changed |= enableSC1Bit(MI);
1513 break;
1514 case SIAtomicScope::WORKGROUP:
1515 // In threadgroup split mode the waves of a work-group can be executing on
1516 // different CUs. Therefore need to bypass the L1 which is per CU.
1517 // Otherwise in non-threadgroup split mode all waves of a work-group are
1518 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1519 // bits to indicate work-group scope will do this automatically.
1520 Changed |= enableSC0Bit(MI);
1521 break;
1522 case SIAtomicScope::WAVEFRONT:
1523 case SIAtomicScope::SINGLETHREAD:
1524 // Leave SC bits unset to indicate wavefront scope.
1525 break;
1526 default:
1527 llvm_unreachable("Unsupported synchronization scope");
1528 }
1529 }
1530
1531 /// The scratch address space does not need the global memory caches
1532 /// to be bypassed as all memory operations by the same thread are
1533 /// sequentially consistent, and no other thread can access scratch
1534 /// memory.
1535
1536 /// Other address spaces do not have a cache.
1537
1538 return Changed;
1539 }
1540
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1541 bool SIGfx940CacheControl::enableStoreCacheBypass(
1542 const MachineBasicBlock::iterator &MI,
1543 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1544 assert(!MI->mayLoad() && MI->mayStore());
1545 bool Changed = false;
1546
1547 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1548 switch (Scope) {
1549 case SIAtomicScope::SYSTEM:
1550 // Set SC bits to indicate system scope.
1551 Changed |= enableSC0Bit(MI);
1552 Changed |= enableSC1Bit(MI);
1553 break;
1554 case SIAtomicScope::AGENT:
1555 // Set SC bits to indicate agent scope.
1556 Changed |= enableSC1Bit(MI);
1557 break;
1558 case SIAtomicScope::WORKGROUP:
1559 // Set SC bits to indicate workgroup scope.
1560 Changed |= enableSC0Bit(MI);
1561 break;
1562 case SIAtomicScope::WAVEFRONT:
1563 case SIAtomicScope::SINGLETHREAD:
1564 // Leave SC bits unset to indicate wavefront scope.
1565 break;
1566 default:
1567 llvm_unreachable("Unsupported synchronization scope");
1568 }
1569 }
1570
1571 /// The scratch address space does not need the global memory caches
1572 /// to be bypassed as all memory operations by the same thread are
1573 /// sequentially consistent, and no other thread can access scratch
1574 /// memory.
1575
1576 /// Other address spaces do not have a cache.
1577
1578 return Changed;
1579 }
1580
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1581 bool SIGfx940CacheControl::enableRMWCacheBypass(
1582 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1583 SIAtomicAddrSpace AddrSpace) const {
1584 assert(MI->mayLoad() && MI->mayStore());
1585 bool Changed = false;
1586
1587 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1588 switch (Scope) {
1589 case SIAtomicScope::SYSTEM:
1590 // Set SC1 bit to indicate system scope.
1591 Changed |= enableSC1Bit(MI);
1592 break;
1593 case SIAtomicScope::AGENT:
1594 case SIAtomicScope::WORKGROUP:
1595 case SIAtomicScope::WAVEFRONT:
1596 case SIAtomicScope::SINGLETHREAD:
1597 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1598 // to indicate system or agent scope. The SC0 bit is used to indicate if
1599 // they are return or no-return. Leave SC1 bit unset to indicate agent
1600 // scope.
1601 break;
1602 default:
1603 llvm_unreachable("Unsupported synchronization scope");
1604 }
1605 }
1606
1607 return Changed;
1608 }
1609
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const1610 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1611 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1612 bool IsVolatile, bool IsNonTemporal) const {
1613 // Only handle load and store, not atomic read-modify-write insructions. The
1614 // latter use glc to indicate if the atomic returns a result and so must not
1615 // be used for cache control.
1616 assert(MI->mayLoad() ^ MI->mayStore());
1617
1618 // Only update load and store, not LLVM IR atomic read-modify-write
1619 // instructions. The latter are always marked as volatile so cannot sensibly
1620 // handle it as do not want to pessimize all atomics. Also they do not support
1621 // the nontemporal attribute.
1622 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1623
1624 bool Changed = false;
1625
1626 if (IsVolatile) {
1627 // Set SC bits to indicate system scope.
1628 Changed |= enableSC0Bit(MI);
1629 Changed |= enableSC1Bit(MI);
1630
1631 // Ensure operation has completed at system scope to cause all volatile
1632 // operations to be visible outside the program in a global order. Do not
1633 // request cross address space as only the global address space can be
1634 // observable outside the program, so no need to cause a waitcnt for LDS
1635 // address space operations.
1636 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1637 Position::AFTER);
1638
1639 return Changed;
1640 }
1641
1642 if (IsNonTemporal) {
1643 Changed |= enableNTBit(MI);
1644 return Changed;
1645 }
1646
1647 return Changed;
1648 }
1649
// Insert the cache invalidations required to implement an acquire operation
// for the given scope and address spaces, before or after \p MI per \p Pos.
// Returns true if any instruction was inserted.
bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  // Cache invalidations can be suppressed for debugging via
  // -amdgcn-skip-cache-invalidations.
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; restored below.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later writes will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but no point generating it in
        // that case if know not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later writes will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1735
// Insert the cache write-backs and waits required to implement a release
// operation for the given scope and address spaces, before or after \p MI
// per \p Pos. Returns true if any instruction was inserted.
bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; restored below
  // before the trailing insertWait so the wait uses the caller's position.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would
      // writeback, and would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // Restore the caller's iterator position before inserting the wait.
  if (Pos == Position::AFTER)
    --MI;

  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
  // S_WAITCNT needed.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}
1798
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1799 bool SIGfx10CacheControl::enableLoadCacheBypass(
1800 const MachineBasicBlock::iterator &MI,
1801 SIAtomicScope Scope,
1802 SIAtomicAddrSpace AddrSpace) const {
1803 assert(MI->mayLoad() && !MI->mayStore());
1804 bool Changed = false;
1805
1806 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1807 switch (Scope) {
1808 case SIAtomicScope::SYSTEM:
1809 case SIAtomicScope::AGENT:
1810 // Set the L0 and L1 cache policies to MISS_EVICT.
1811 // Note: there is no L2 cache coherent bypass control at the ISA level.
1812 Changed |= enableGLCBit(MI);
1813 Changed |= enableDLCBit(MI);
1814 break;
1815 case SIAtomicScope::WORKGROUP:
1816 // In WGP mode the waves of a work-group can be executing on either CU of
1817 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1818 // CU mode all waves of a work-group are on the same CU, and so the L0
1819 // does not need to be bypassed.
1820 if (!ST.isCuModeEnabled())
1821 Changed |= enableGLCBit(MI);
1822 break;
1823 case SIAtomicScope::WAVEFRONT:
1824 case SIAtomicScope::SINGLETHREAD:
1825 // No cache to bypass.
1826 break;
1827 default:
1828 llvm_unreachable("Unsupported synchronization scope");
1829 }
1830 }
1831
1832 /// The scratch address space does not need the global memory caches
1833 /// to be bypassed as all memory operations by the same thread are
1834 /// sequentially consistent, and no other thread can access scratch
1835 /// memory.
1836
1837 /// Other address spaces do not have a cache.
1838
1839 return Changed;
1840 }
1841
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const1842 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1843 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1844 bool IsVolatile, bool IsNonTemporal) const {
1845
1846 // Only handle load and store, not atomic read-modify-write insructions. The
1847 // latter use glc to indicate if the atomic returns a result and so must not
1848 // be used for cache control.
1849 assert(MI->mayLoad() ^ MI->mayStore());
1850
1851 // Only update load and store, not LLVM IR atomic read-modify-write
1852 // instructions. The latter are always marked as volatile so cannot sensibly
1853 // handle it as do not want to pessimize all atomics. Also they do not support
1854 // the nontemporal attribute.
1855 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1856
1857 bool Changed = false;
1858
1859 if (IsVolatile) {
1860 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1861 // and MISS_LRU for store instructions.
1862 // Note: there is no L2 cache coherent bypass control at the ISA level.
1863 if (Op == SIMemOp::LOAD) {
1864 Changed |= enableGLCBit(MI);
1865 Changed |= enableDLCBit(MI);
1866 }
1867
1868 // Ensure operation has completed at system scope to cause all volatile
1869 // operations to be visible outside the program in a global order. Do not
1870 // request cross address space as only the global address space can be
1871 // observable outside the program, so no need to cause a waitcnt for LDS
1872 // address space operations.
1873 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1874 Position::AFTER);
1875 return Changed;
1876 }
1877
1878 if (IsNonTemporal) {
1879 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1880 // and L2 cache policy to STREAM.
1881 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1882 // to MISS_EVICT and the L2 cache policy to STREAM.
1883 if (Op == SIMemOp::STORE)
1884 Changed |= enableGLCBit(MI);
1885 Changed |= enableSLCBit(MI);
1886
1887 return Changed;
1888 }
1889
1890 return Changed;
1891 }
1892
// Insert the S_WAITCNT/S_WAITCNT_VSCNT instructions required to order the
// given memory operation kinds at the given scope and address spaces, before
// or after \p MI per \p Pos. Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; restored below.
  if (Pos == Position::AFTER)
    ++MI;

  // Which counters must drain to zero. On GFX10 loads are tracked by vmcnt
  // and stores by the separate vscnt counter.
  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // vmcnt and lgkmcnt share one S_WAITCNT; counters not being waited on keep
  // their full bitmask (i.e. "don't wait"). The _soft variants may be
  // optimized away later by SIInsertWaitcnts if proven redundant.
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  // Stores are tracked by the separate vscnt counter and need their own
  // S_WAITCNT_VSCNT instruction.
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
2013
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const2014 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2015 SIAtomicScope Scope,
2016 SIAtomicAddrSpace AddrSpace,
2017 Position Pos) const {
2018 if (!InsertCacheInv)
2019 return false;
2020
2021 bool Changed = false;
2022
2023 MachineBasicBlock &MBB = *MI->getParent();
2024 DebugLoc DL = MI->getDebugLoc();
2025
2026 if (Pos == Position::AFTER)
2027 ++MI;
2028
2029 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2030 switch (Scope) {
2031 case SIAtomicScope::SYSTEM:
2032 case SIAtomicScope::AGENT:
2033 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2034 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2035 Changed = true;
2036 break;
2037 case SIAtomicScope::WORKGROUP:
2038 // In WGP mode the waves of a work-group can be executing on either CU of
2039 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2040 // in CU mode and all waves of a work-group are on the same CU, and so the
2041 // L0 does not need to be invalidated.
2042 if (!ST.isCuModeEnabled()) {
2043 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2044 Changed = true;
2045 }
2046 break;
2047 case SIAtomicScope::WAVEFRONT:
2048 case SIAtomicScope::SINGLETHREAD:
2049 // No cache to invalidate.
2050 break;
2051 default:
2052 llvm_unreachable("Unsupported synchronization scope");
2053 }
2054 }
2055
2056 /// The scratch address space does not need the global memory cache
2057 /// to be flushed as all memory operations by the same thread are
2058 /// sequentially consistent, and no other thread can access scratch
2059 /// memory.
2060
2061 /// Other address spaces do not have a cache.
2062
2063 if (Pos == Position::AFTER)
2064 --MI;
2065
2066 return Changed;
2067 }
2068
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const2069 bool SIGfx11CacheControl::enableLoadCacheBypass(
2070 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2071 SIAtomicAddrSpace AddrSpace) const {
2072 assert(MI->mayLoad() && !MI->mayStore());
2073 bool Changed = false;
2074
2075 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2076 switch (Scope) {
2077 case SIAtomicScope::SYSTEM:
2078 case SIAtomicScope::AGENT:
2079 // Set the L0 and L1 cache policies to MISS_EVICT.
2080 // Note: there is no L2 cache coherent bypass control at the ISA level.
2081 Changed |= enableGLCBit(MI);
2082 break;
2083 case SIAtomicScope::WORKGROUP:
2084 // In WGP mode the waves of a work-group can be executing on either CU of
2085 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2086 // CU mode all waves of a work-group are on the same CU, and so the L0
2087 // does not need to be bypassed.
2088 if (!ST.isCuModeEnabled())
2089 Changed |= enableGLCBit(MI);
2090 break;
2091 case SIAtomicScope::WAVEFRONT:
2092 case SIAtomicScope::SINGLETHREAD:
2093 // No cache to bypass.
2094 break;
2095 default:
2096 llvm_unreachable("Unsupported synchronization scope");
2097 }
2098 }
2099
2100 /// The scratch address space does not need the global memory caches
2101 /// to be bypassed as all memory operations by the same thread are
2102 /// sequentially consistent, and no other thread can access scratch
2103 /// memory.
2104
2105 /// Other address spaces do not have a cache.
2106
2107 return Changed;
2108 }
2109
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const2110 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2111 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2112 bool IsVolatile, bool IsNonTemporal) const {
2113
2114 // Only handle load and store, not atomic read-modify-write insructions. The
2115 // latter use glc to indicate if the atomic returns a result and so must not
2116 // be used for cache control.
2117 assert(MI->mayLoad() ^ MI->mayStore());
2118
2119 // Only update load and store, not LLVM IR atomic read-modify-write
2120 // instructions. The latter are always marked as volatile so cannot sensibly
2121 // handle it as do not want to pessimize all atomics. Also they do not support
2122 // the nontemporal attribute.
2123 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2124
2125 bool Changed = false;
2126
2127 if (IsVolatile) {
2128 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2129 // and MISS_LRU for store instructions.
2130 // Note: there is no L2 cache coherent bypass control at the ISA level.
2131 if (Op == SIMemOp::LOAD)
2132 Changed |= enableGLCBit(MI);
2133
2134 // Set MALL NOALLOC for load and store instructions.
2135 Changed |= enableDLCBit(MI);
2136
2137 // Ensure operation has completed at system scope to cause all volatile
2138 // operations to be visible outside the program in a global order. Do not
2139 // request cross address space as only the global address space can be
2140 // observable outside the program, so no need to cause a waitcnt for LDS
2141 // address space operations.
2142 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2143 Position::AFTER);
2144 return Changed;
2145 }
2146
2147 if (IsNonTemporal) {
2148 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2149 // and L2 cache policy to STREAM.
2150 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2151 // to MISS_EVICT and the L2 cache policy to STREAM.
2152 if (Op == SIMemOp::STORE)
2153 Changed |= enableGLCBit(MI);
2154 Changed |= enableSLCBit(MI);
2155
2156 // Set MALL NOALLOC for load and store instructions.
2157 Changed |= enableDLCBit(MI);
2158 return Changed;
2159 }
2160
2161 return Changed;
2162 }
2163
setTH(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Value) const2164 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2165 AMDGPU::CPol::CPol Value) const {
2166 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2167 if (!CPol)
2168 return false;
2169
2170 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2171 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2172 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2173 return true;
2174 }
2175
2176 return false;
2177 }
2178
setScope(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Value) const2179 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2180 AMDGPU::CPol::CPol Value) const {
2181 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2182 if (!CPol)
2183 return false;
2184
2185 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2186 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2187 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2188 return true;
2189 }
2190
2191 return false;
2192 }
2193
// Insert the GFX12 split-counter waits (S_WAIT_LOADCNT/STORECNT/DSCNT etc.)
// required to order the given memory operation kinds at the given scope and
// address spaces, before or after \p MI per \p Pos. Returns true if any
// instruction was inserted.
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Which of the GFX12 counters must drain to zero.
  bool LOADCnt = false;
  bool DSCnt = false;
  bool STORECnt = false;

  // Temporarily step past MI so BuildMI inserts after it; restored below.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        LOADCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        STORECnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          LOADCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          STORECnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      DSCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (LOADCnt) {
    // Acquires at or above workgroup scope must also wait on sample and BVH
    // counters, which track the other VMEM-load-like operations.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
    Changed = true;
  }

  if (STORECnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
    Changed = true;
  }

  if (DSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
    Changed = true;
  }

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
2289
// Insert the GLOBAL_INV required to implement an acquire operation for the
// given scope, before or after \p MI per \p Pos. Returns true if an
// instruction was inserted.
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  // Cache invalidations can be suppressed for debugging via
  // -amdgcn-skip-cache-invalidations.
  if (!InsertCacheInv)
    return false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
    return false;

  // Map the atomic scope onto the scope field of GLOBAL_INV.
  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::WORKGROUP:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to invalidate the L0 which is per CU.
    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
    // the L0 does not need to be invalidated.
    if (ST.isCuModeEnabled())
      return false;

    ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to invalidate.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  // Temporarily step past MI so BuildMI inserts after it; restored below.
  if (Pos == Position::AFTER)
    ++MI;

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return true;
}
2345
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const2346 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2347 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2348 bool IsVolatile, bool IsNonTemporal) const {
2349
2350 // Only handle load and store, not atomic read-modify-write instructions.
2351 assert(MI->mayLoad() ^ MI->mayStore());
2352
2353 // Only update load and store, not LLVM IR atomic read-modify-write
2354 // instructions. The latter are always marked as volatile so cannot sensibly
2355 // handle it as do not want to pessimize all atomics. Also they do not support
2356 // the nontemporal attribute.
2357 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2358
2359 bool Changed = false;
2360
2361 if (IsNonTemporal) {
2362 // Set non-temporal hint for all cache levels.
2363 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2364 }
2365
2366 if (IsVolatile) {
2367 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2368
2369 // Ensure operation has completed at system scope to cause all volatile
2370 // operations to be visible outside the program in a global order. Do not
2371 // request cross address space as only the global address space can be
2372 // observable outside the program, so no need to cause a waitcnt for LDS
2373 // address space operations.
2374 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2375 Position::AFTER);
2376 }
2377
2378 return Changed;
2379 }
2380
removeAtomicPseudoMIs()2381 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2382 if (AtomicPseudoMIs.empty())
2383 return false;
2384
2385 for (auto &MI : AtomicPseudoMIs)
2386 MI->eraseFromParent();
2387
2388 AtomicPseudoMIs.clear();
2389 return true;
2390 }
2391
// Expand a load according to the memory model: insert cache bypass, waits,
// and invalidates for atomic loads, or cache-policy bits for volatile /
// nontemporal loads. Returns true if the code was changed.
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Monotonic and stronger loads must bypass caches so they observe writes
    // made at the requested scope.
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    // seq_cst additionally orders the load after all prior memory operations.
    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire semantics: wait for the load itself to complete, then
    // invalidate caches so subsequent loads see writes released before it.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
2436
// Expand a store according to the memory model: insert cache bypass and
// release sequences for atomic stores, or cache-policy bits for volatile /
// nontemporal stores. Returns true if the code was changed.
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Monotonic and stronger stores must bypass caches so their writes are
    // visible at the requested scope.
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    // Release semantics: make all prior memory operations visible before the
    // store itself is performed.
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}
2469
// Expand an ATOMIC_FENCE pseudo into the wait / release / acquire sequence
// it implies, then queue the pseudo itself for removal. Returns true if the
// code was changed.
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  // The pseudo produces no machine code; it is erased later by
  // removeAtomicPseudoMIs once the expansion has been inserted around it.
  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    // An acquire fence must wait for prior loads and stores to complete
    // before the invalidate inserted below.
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the proceeding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBINV*" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
2516
expandAtomicCmpxchgOrRmw(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)2517 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2518 MachineBasicBlock::iterator &MI) {
2519 assert(MI->mayLoad() && MI->mayStore());
2520
2521 bool Changed = false;
2522
2523 if (MOI.isAtomic()) {
2524 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2525 MOI.getOrdering() == AtomicOrdering::Acquire ||
2526 MOI.getOrdering() == AtomicOrdering::Release ||
2527 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2528 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2529 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2530 MOI.getInstrAddrSpace());
2531 }
2532
2533 if (MOI.getOrdering() == AtomicOrdering::Release ||
2534 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2535 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2536 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2537 Changed |= CC->insertRelease(MI, MOI.getScope(),
2538 MOI.getOrderingAddrSpace(),
2539 MOI.getIsCrossAddressSpaceOrdering(),
2540 Position::BEFORE);
2541
2542 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2543 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2544 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2545 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2546 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2547 Changed |= CC->insertWait(MI, MOI.getScope(),
2548 MOI.getInstrAddrSpace(),
2549 isAtomicRet(*MI) ? SIMemOp::LOAD :
2550 SIMemOp::STORE,
2551 MOI.getIsCrossAddressSpaceOrdering(),
2552 Position::AFTER);
2553 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2554 MOI.getOrderingAddrSpace(),
2555 Position::AFTER);
2556 }
2557
2558 return Changed;
2559 }
2560
2561 return Changed;
2562 }
2563
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  // Per-function state: memory-operation analysis and the subtarget-specific
  // cache-control strategy used by the expand* helpers.
  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        // `++II` seats II on the first bundled instruction; I walks the rest
        // of the bundle while II keeps pointing at that first instruction.
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          // Operands inside a bundle are marked internal-read; clear that
          // now that the instructions stand alone.
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        // Drop the BUNDLE header and resume iteration at the first
        // unbundled instruction so it is processed below.
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      // Skip instructions the target never treats as atomic/memory-model
      // relevant.
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      // Classify the instruction and expand it accordingly. The cases are
      // mutually exclusive; the first matching classification wins.
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  // Delete the fence pseudos queued by expandAtomicFence.
  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
2606
// Register the pass with the legacy pass manager.
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

// Unique pass identity; SIMemoryLegalizerID is the handle other code uses to
// refer to this pass.
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2611
// Factory used by the target's pass pipeline setup; the pass manager takes
// ownership of the returned pass.
FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
2615