1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31
// Debug/bring-up knob: when set, the pass still inserts waits but omits the
// cache invalidating instructions it would normally emit.
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
35
36 namespace {
37
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39
/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
47
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
54
/// The atomic synchronization scopes supported by the AMDGPU target.
/// Ordered from narrowest to widest scope; code below relies on this
/// ordering (e.g. std::min is used to clamp a scope).
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
64
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
86
87 class SIMemOpInfo final {
88 private:
89
90 friend class SIMemOpAccess;
91
92 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97 bool IsCrossAddressSpaceOrdering = false;
98 bool IsVolatile = false;
99 bool IsNonTemporal = false;
100
SIMemOpInfo(AtomicOrdering Ordering=AtomicOrdering::SequentiallyConsistent,SIAtomicScope Scope=SIAtomicScope::SYSTEM,SIAtomicAddrSpace OrderingAddrSpace=SIAtomicAddrSpace::ATOMIC,SIAtomicAddrSpace InstrAddrSpace=SIAtomicAddrSpace::ALL,bool IsCrossAddressSpaceOrdering=true,AtomicOrdering FailureOrdering=AtomicOrdering::SequentiallyConsistent,bool IsVolatile=false,bool IsNonTemporal=false)101 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105 bool IsCrossAddressSpaceOrdering = true,
106 AtomicOrdering FailureOrdering =
107 AtomicOrdering::SequentiallyConsistent,
108 bool IsVolatile = false,
109 bool IsNonTemporal = false)
110 : Ordering(Ordering), FailureOrdering(FailureOrdering),
111 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112 InstrAddrSpace(InstrAddrSpace),
113 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114 IsVolatile(IsVolatile),
115 IsNonTemporal(IsNonTemporal) {
116
117 if (Ordering == AtomicOrdering::NotAtomic) {
118 assert(Scope == SIAtomicScope::NONE &&
119 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120 !IsCrossAddressSpaceOrdering &&
121 FailureOrdering == AtomicOrdering::NotAtomic);
122 return;
123 }
124
125 assert(Scope != SIAtomicScope::NONE &&
126 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127 SIAtomicAddrSpace::NONE &&
128 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129 SIAtomicAddrSpace::NONE &&
130 !isStrongerThan(FailureOrdering, Ordering));
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
154
155 public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
getScope() const158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
getOrdering() const164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
getFailureOrdering() const170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces be accessed by the machine
175 /// instruction used to create this SiMemOpInfo.
getInstrAddrSpace() const176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SiMemOpInfo.
getOrderingAddrSpace() const182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns Return true iff memory ordering of operations on
187 /// different address spaces is required.
getIsCrossAddressSpaceOrdering() const188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
isVolatile() const194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
isNonTemporal() const200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
isAtomic() const206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210 };
211
/// Factory that inspects machine instructions (via their machine memory
/// operands or, for fences, their immediate operands) and produces the
/// corresponding SIMemOpInfo summaries.
class SIMemOpAccess final {
private:
  // Provides target sync-scope IDs and scope-inclusion queries.
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least machine memory
  /// operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};
258
/// Abstract interface that hides the per-generation differences in how cache
/// bypass bits, waits, acquires and releases are emitted. Concrete
/// subclasses (SIGfx6/7/90A/10CacheControl) are selected by create().
class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  // ISA version, used when encoding waitcnt-style instructions.
  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p BitName to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope .
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};
354
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357
358 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359 /// is modified, false otherwise.
enableGLCBit(const MachineBasicBlock::iterator & MI) const360 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361 return enableNamedBit(MI, AMDGPU::CPol::GLC);
362 }
363
364 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365 /// is modified, false otherwise.
enableSLCBit(const MachineBasicBlock::iterator & MI) const366 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367 return enableNamedBit(MI, AMDGPU::CPol::SLC);
368 }
369
370 public:
371
SIGfx6CacheControl(const GCNSubtarget & ST)372 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
373
374 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375 SIAtomicScope Scope,
376 SIAtomicAddrSpace AddrSpace) const override;
377
378 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379 SIAtomicScope Scope,
380 SIAtomicAddrSpace AddrSpace) const override;
381
382 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383 SIAtomicScope Scope,
384 SIAtomicAddrSpace AddrSpace) const override;
385
386 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388 bool IsVolatile,
389 bool IsNonTemporal) const override;
390
391 bool insertWait(MachineBasicBlock::iterator &MI,
392 SIAtomicScope Scope,
393 SIAtomicAddrSpace AddrSpace,
394 SIMemOp Op,
395 bool IsCrossAddrSpaceOrdering,
396 Position Pos) const override;
397
398 bool insertAcquire(MachineBasicBlock::iterator &MI,
399 SIAtomicScope Scope,
400 SIAtomicAddrSpace AddrSpace,
401 Position Pos) const override;
402
403 bool insertRelease(MachineBasicBlock::iterator &MI,
404 SIAtomicScope Scope,
405 SIAtomicAddrSpace AddrSpace,
406 bool IsCrossAddrSpaceOrdering,
407 Position Pos) const override;
408 };
409
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412
SIGfx7CacheControl(const GCNSubtarget & ST)413 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
414
415 bool insertAcquire(MachineBasicBlock::iterator &MI,
416 SIAtomicScope Scope,
417 SIAtomicAddrSpace AddrSpace,
418 Position Pos) const override;
419
420 };
421
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424
SIGfx90ACacheControl(const GCNSubtarget & ST)425 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
426
427 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428 SIAtomicScope Scope,
429 SIAtomicAddrSpace AddrSpace) const override;
430
431 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432 SIAtomicScope Scope,
433 SIAtomicAddrSpace AddrSpace) const override;
434
435 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436 SIAtomicScope Scope,
437 SIAtomicAddrSpace AddrSpace) const override;
438
439 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441 bool IsVolatile,
442 bool IsNonTemporal) const override;
443
444 bool insertWait(MachineBasicBlock::iterator &MI,
445 SIAtomicScope Scope,
446 SIAtomicAddrSpace AddrSpace,
447 SIMemOp Op,
448 bool IsCrossAddrSpaceOrdering,
449 Position Pos) const override;
450
451 bool insertAcquire(MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace,
454 Position Pos) const override;
455
456 bool insertRelease(MachineBasicBlock::iterator &MI,
457 SIAtomicScope Scope,
458 SIAtomicAddrSpace AddrSpace,
459 bool IsCrossAddrSpaceOrdering,
460 Position Pos) const override;
461 };
462
463 class SIGfx10CacheControl : public SIGfx7CacheControl {
464 protected:
465
466 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
467 /// is modified, false otherwise.
enableDLCBit(const MachineBasicBlock::iterator & MI) const468 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
469 return enableNamedBit(MI, AMDGPU::CPol::DLC);
470 }
471
472 public:
473
SIGfx10CacheControl(const GCNSubtarget & ST)474 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
475
476 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
477 SIAtomicScope Scope,
478 SIAtomicAddrSpace AddrSpace) const override;
479
480 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
481 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
482 bool IsVolatile,
483 bool IsNonTemporal) const override;
484
485 bool insertWait(MachineBasicBlock::iterator &MI,
486 SIAtomicScope Scope,
487 SIAtomicAddrSpace AddrSpace,
488 SIMemOp Op,
489 bool IsCrossAddrSpaceOrdering,
490 Position Pos) const override;
491
492 bool insertAcquire(MachineBasicBlock::iterator &MI,
493 SIAtomicScope Scope,
494 SIAtomicAddrSpace AddrSpace,
495 Position Pos) const override;
496 };
497
/// Machine function pass that implements the AMDGPU memory model by expanding
/// atomic/volatile/nontemporal memory operations into the required cache
/// control bits, waits, acquires and releases.
class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is a atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  // The pass only inserts/annotates instructions; the CFG is untouched.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
550
551 } // end namespace anonymous
552
reportUnsupported(const MachineBasicBlock::iterator & MI,const char * Msg) const553 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
554 const char *Msg) const {
555 const Function &Func = MI->getParent()->getParent()->getFunction();
556 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
557 Func.getContext().diagnose(Diag);
558 }
559
560 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
toSIAtomicScope(SyncScope::ID SSID,SIAtomicAddrSpace InstrAddrSpace) const561 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
562 SIAtomicAddrSpace InstrAddrSpace) const {
563 if (SSID == SyncScope::System)
564 return std::make_tuple(SIAtomicScope::SYSTEM,
565 SIAtomicAddrSpace::ATOMIC,
566 true);
567 if (SSID == MMI->getAgentSSID())
568 return std::make_tuple(SIAtomicScope::AGENT,
569 SIAtomicAddrSpace::ATOMIC,
570 true);
571 if (SSID == MMI->getWorkgroupSSID())
572 return std::make_tuple(SIAtomicScope::WORKGROUP,
573 SIAtomicAddrSpace::ATOMIC,
574 true);
575 if (SSID == MMI->getWavefrontSSID())
576 return std::make_tuple(SIAtomicScope::WAVEFRONT,
577 SIAtomicAddrSpace::ATOMIC,
578 true);
579 if (SSID == SyncScope::SingleThread)
580 return std::make_tuple(SIAtomicScope::SINGLETHREAD,
581 SIAtomicAddrSpace::ATOMIC,
582 true);
583 if (SSID == MMI->getSystemOneAddressSpaceSSID())
584 return std::make_tuple(SIAtomicScope::SYSTEM,
585 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
586 false);
587 if (SSID == MMI->getAgentOneAddressSpaceSSID())
588 return std::make_tuple(SIAtomicScope::AGENT,
589 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
590 false);
591 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
592 return std::make_tuple(SIAtomicScope::WORKGROUP,
593 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
594 false);
595 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
596 return std::make_tuple(SIAtomicScope::WAVEFRONT,
597 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
598 false);
599 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
600 return std::make_tuple(SIAtomicScope::SINGLETHREAD,
601 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
602 false);
603 return None;
604 }
605
toSIAtomicAddrSpace(unsigned AS) const606 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
607 if (AS == AMDGPUAS::FLAT_ADDRESS)
608 return SIAtomicAddrSpace::FLAT;
609 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
610 return SIAtomicAddrSpace::GLOBAL;
611 if (AS == AMDGPUAS::LOCAL_ADDRESS)
612 return SIAtomicAddrSpace::LDS;
613 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
614 return SIAtomicAddrSpace::SCRATCH;
615 if (AS == AMDGPUAS::REGION_ADDRESS)
616 return SIAtomicAddrSpace::GDS;
617
618 return SIAtomicAddrSpace::OTHER;
619 }
620
SIMemOpAccess(MachineFunction & MF)621 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
622 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
623 }
624
constructFromMIWithMMO(const MachineBasicBlock::iterator & MI) const625 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
626 const MachineBasicBlock::iterator &MI) const {
627 assert(MI->getNumMemOperands() > 0);
628
629 SyncScope::ID SSID = SyncScope::SingleThread;
630 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
631 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
632 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
633 bool IsNonTemporal = true;
634 bool IsVolatile = false;
635
636 // Validator should check whether or not MMOs cover the entire set of
637 // locations accessed by the memory instruction.
638 for (const auto &MMO : MI->memoperands()) {
639 IsNonTemporal &= MMO->isNonTemporal();
640 IsVolatile |= MMO->isVolatile();
641 InstrAddrSpace |=
642 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
643 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
644 if (OpOrdering != AtomicOrdering::NotAtomic) {
645 const auto &IsSyncScopeInclusion =
646 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
647 if (!IsSyncScopeInclusion) {
648 reportUnsupported(MI,
649 "Unsupported non-inclusive atomic synchronization scope");
650 return None;
651 }
652
653 SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
654 Ordering = isStrongerThan(Ordering, OpOrdering)
655 ? Ordering
656 : MMO->getSuccessOrdering();
657 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
658 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
659 FailureOrdering =
660 isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
661 FailureOrdering : MMO->getFailureOrdering();
662 }
663 }
664
665 SIAtomicScope Scope = SIAtomicScope::NONE;
666 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
667 bool IsCrossAddressSpaceOrdering = false;
668 if (Ordering != AtomicOrdering::NotAtomic) {
669 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
670 if (!ScopeOrNone) {
671 reportUnsupported(MI, "Unsupported atomic synchronization scope");
672 return None;
673 }
674 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
675 ScopeOrNone.getValue();
676 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
677 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
678 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
679 reportUnsupported(MI, "Unsupported atomic address space");
680 return None;
681 }
682 }
683 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
684 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
685 IsNonTemporal);
686 }
687
getLoadInfo(const MachineBasicBlock::iterator & MI) const688 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
689 const MachineBasicBlock::iterator &MI) const {
690 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
691
692 if (!(MI->mayLoad() && !MI->mayStore()))
693 return None;
694
695 // Be conservative if there are no memory operands.
696 if (MI->getNumMemOperands() == 0)
697 return SIMemOpInfo();
698
699 return constructFromMIWithMMO(MI);
700 }
701
getStoreInfo(const MachineBasicBlock::iterator & MI) const702 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
703 const MachineBasicBlock::iterator &MI) const {
704 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
705
706 if (!(!MI->mayLoad() && MI->mayStore()))
707 return None;
708
709 // Be conservative if there are no memory operands.
710 if (MI->getNumMemOperands() == 0)
711 return SIMemOpInfo();
712
713 return constructFromMIWithMMO(MI);
714 }
715
getAtomicFenceInfo(const MachineBasicBlock::iterator & MI) const716 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
717 const MachineBasicBlock::iterator &MI) const {
718 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
719
720 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
721 return None;
722
723 AtomicOrdering Ordering =
724 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
725
726 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
727 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
728 if (!ScopeOrNone) {
729 reportUnsupported(MI, "Unsupported atomic synchronization scope");
730 return None;
731 }
732
733 SIAtomicScope Scope = SIAtomicScope::NONE;
734 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
735 bool IsCrossAddressSpaceOrdering = false;
736 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
737 ScopeOrNone.getValue();
738
739 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
740 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
741 reportUnsupported(MI, "Unsupported atomic address space");
742 return None;
743 }
744
745 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
746 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
747 }
748
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator & MI) const749 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
750 const MachineBasicBlock::iterator &MI) const {
751 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
752
753 if (!(MI->mayLoad() && MI->mayStore()))
754 return None;
755
756 // Be conservative if there are no memory operands.
757 if (MI->getNumMemOperands() == 0)
758 return SIMemOpInfo();
759
760 return constructFromMIWithMMO(MI);
761 }
762
SICacheControl(const GCNSubtarget & ST)763 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
764 TII = ST.getInstrInfo();
765 IV = getIsaVersion(ST.getCPU());
766 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
767 }
768
enableNamedBit(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Bit) const769 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
770 AMDGPU::CPol::CPol Bit) const {
771 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
772 if (!CPol)
773 return false;
774
775 CPol->setImm(CPol->getImm() | Bit);
776 return true;
777 }
778
779 /* static */
create(const GCNSubtarget & ST)780 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
781 GCNSubtarget::Generation Generation = ST.getGeneration();
782 if (ST.hasGFX90AInsts())
783 return std::make_unique<SIGfx90ACacheControl>(ST);
784 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
785 return std::make_unique<SIGfx6CacheControl>(ST);
786 if (Generation < AMDGPUSubtarget::GFX10)
787 return std::make_unique<SIGfx7CacheControl>(ST);
788 return std::make_unique<SIGfx10CacheControl>(ST);
789 }
790
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const791 bool SIGfx6CacheControl::enableLoadCacheBypass(
792 const MachineBasicBlock::iterator &MI,
793 SIAtomicScope Scope,
794 SIAtomicAddrSpace AddrSpace) const {
795 assert(MI->mayLoad() && !MI->mayStore());
796 bool Changed = false;
797
798 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
799 switch (Scope) {
800 case SIAtomicScope::SYSTEM:
801 case SIAtomicScope::AGENT:
802 Changed |= enableGLCBit(MI);
803 break;
804 case SIAtomicScope::WORKGROUP:
805 case SIAtomicScope::WAVEFRONT:
806 case SIAtomicScope::SINGLETHREAD:
807 // No cache to bypass.
808 break;
809 default:
810 llvm_unreachable("Unsupported synchronization scope");
811 }
812 }
813
814 /// The scratch address space does not need the global memory caches
815 /// to be bypassed as all memory operations by the same thread are
816 /// sequentially consistent, and no other thread can access scratch
817 /// memory.
818
819 /// Other address spaces do not have a cache.
820
821 return Changed;
822 }
823
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const824 bool SIGfx6CacheControl::enableStoreCacheBypass(
825 const MachineBasicBlock::iterator &MI,
826 SIAtomicScope Scope,
827 SIAtomicAddrSpace AddrSpace) const {
828 assert(!MI->mayLoad() && MI->mayStore());
829 bool Changed = false;
830
831 /// The L1 cache is write through so does not need to be bypassed. There is no
832 /// bypass control for the L2 cache at the isa level.
833
834 return Changed;
835 }
836
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const837 bool SIGfx6CacheControl::enableRMWCacheBypass(
838 const MachineBasicBlock::iterator &MI,
839 SIAtomicScope Scope,
840 SIAtomicAddrSpace AddrSpace) const {
841 assert(MI->mayLoad() && MI->mayStore());
842 bool Changed = false;
843
844 /// The L1 cache is write through so does not need to be bypassed. There is no
845 /// bypass control for the L2 cache at the isa level.
846
847 return Changed;
848 }
849
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
851 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
852 bool IsVolatile, bool IsNonTemporal) const {
853 // Only handle load and store, not atomic read-modify-write insructions. The
854 // latter use glc to indicate if the atomic returns a result and so must not
855 // be used for cache control.
856 assert(MI->mayLoad() ^ MI->mayStore());
857
858 // Only update load and store, not LLVM IR atomic read-modify-write
859 // instructions. The latter are always marked as volatile so cannot sensibly
860 // handle it as do not want to pessimize all atomics. Also they do not support
861 // the nontemporal attribute.
862 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
863
864 bool Changed = false;
865
866 if (IsVolatile) {
867 if (Op == SIMemOp::LOAD)
868 Changed |= enableGLCBit(MI);
869
870 // Ensure operation has completed at system scope to cause all volatile
871 // operations to be visible outside the program in a global order. Do not
872 // request cross address space as only the global address space can be
873 // observable outside the program, so no need to cause a waitcnt for LDS
874 // address space operations.
875 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
876 Position::AFTER);
877
878 return Changed;
879 }
880
881 if (IsNonTemporal) {
882 // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
883 Changed |= enableGLCBit(MI);
884 Changed |= enableSLCBit(MI);
885 return Changed;
886 }
887
888 return Changed;
889 }
890
/// Insert an S_WAITCNT strong enough to order, at \p Scope, memory operations
/// of kind \p Op on the address spaces in \p AddrSpace. The wait is placed
/// before or after \p MI according to \p Pos. \p IsCrossAddrSpaceOrdering
/// requests ordering between different address spaces (e.g. LDS vs global).
/// Returns true if an instruction was inserted.
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator, so step past MI to insert after it;
  // the iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  // Which hardware counters need to be waited on (to zero).
  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Global/scratch accesses are tracked by vmcnt; wait for them to
      // complete so they are visible beyond the work-group.
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // A counter encoded as 0 waits for all outstanding operations it tracks;
    // counters not being waited on are left at their maximum (i.e. no wait).
    // expcnt is never waited on here.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
987
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const988 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
989 SIAtomicScope Scope,
990 SIAtomicAddrSpace AddrSpace,
991 Position Pos) const {
992 if (!InsertCacheInv)
993 return false;
994
995 bool Changed = false;
996
997 MachineBasicBlock &MBB = *MI->getParent();
998 DebugLoc DL = MI->getDebugLoc();
999
1000 if (Pos == Position::AFTER)
1001 ++MI;
1002
1003 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1004 switch (Scope) {
1005 case SIAtomicScope::SYSTEM:
1006 case SIAtomicScope::AGENT:
1007 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1008 Changed = true;
1009 break;
1010 case SIAtomicScope::WORKGROUP:
1011 case SIAtomicScope::WAVEFRONT:
1012 case SIAtomicScope::SINGLETHREAD:
1013 // No cache to invalidate.
1014 break;
1015 default:
1016 llvm_unreachable("Unsupported synchronization scope");
1017 }
1018 }
1019
1020 /// The scratch address space does not need the global memory cache
1021 /// to be flushed as all memory operations by the same thread are
1022 /// sequentially consistent, and no other thread can access scratch
1023 /// memory.
1024
1025 /// Other address spaces do not have a cache.
1026
1027 if (Pos == Position::AFTER)
1028 --MI;
1029
1030 return Changed;
1031 }
1032
insertRelease(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,bool IsCrossAddrSpaceOrdering,Position Pos) const1033 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1034 SIAtomicScope Scope,
1035 SIAtomicAddrSpace AddrSpace,
1036 bool IsCrossAddrSpaceOrdering,
1037 Position Pos) const {
1038 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1039 IsCrossAddrSpaceOrdering, Pos);
1040 }
1041
/// Insert the L1 invalidate required by an acquire at \p Scope on
/// \p AddrSpace, placed before or after \p MI according to \p Pos.
/// Returns true if an instruction was inserted.
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  // NOTE(review): PAL and Mesa targets use the full BUFFER_WBINVL1 while
  // other OSes use the _VOL variant -- presumably the _VOL form is not
  // suitable on those platforms; confirm against target documentation.
  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  // BuildMI inserts before the iterator; step past MI to insert after it.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1092
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1093 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1094 const MachineBasicBlock::iterator &MI,
1095 SIAtomicScope Scope,
1096 SIAtomicAddrSpace AddrSpace) const {
1097 assert(MI->mayLoad() && !MI->mayStore());
1098 bool Changed = false;
1099
1100 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1101 switch (Scope) {
1102 case SIAtomicScope::SYSTEM:
1103 case SIAtomicScope::AGENT:
1104 Changed |= enableGLCBit(MI);
1105 break;
1106 case SIAtomicScope::WORKGROUP:
1107 // In threadgroup split mode the waves of a work-group can be executing on
1108 // different CUs. Therefore need to bypass the L1 which is per CU.
1109 // Otherwise in non-threadgroup split mode all waves of a work-group are
1110 // on the same CU, and so the L1 does not need to be bypassed.
1111 if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
1112 break;
1113 case SIAtomicScope::WAVEFRONT:
1114 case SIAtomicScope::SINGLETHREAD:
1115 // No cache to bypass.
1116 break;
1117 default:
1118 llvm_unreachable("Unsupported synchronization scope");
1119 }
1120 }
1121
1122 /// The scratch address space does not need the global memory caches
1123 /// to be bypassed as all memory operations by the same thread are
1124 /// sequentially consistent, and no other thread can access scratch
1125 /// memory.
1126
1127 /// Other address spaces do not have a cache.
1128
1129 return Changed;
1130 }
1131
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1132 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1133 const MachineBasicBlock::iterator &MI,
1134 SIAtomicScope Scope,
1135 SIAtomicAddrSpace AddrSpace) const {
1136 assert(!MI->mayLoad() && MI->mayStore());
1137 bool Changed = false;
1138
1139 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1140 switch (Scope) {
1141 case SIAtomicScope::SYSTEM:
1142 case SIAtomicScope::AGENT:
1143 /// Do not set glc for store atomic operations as they implicitly write
1144 /// through the L1 cache.
1145 break;
1146 case SIAtomicScope::WORKGROUP:
1147 case SIAtomicScope::WAVEFRONT:
1148 case SIAtomicScope::SINGLETHREAD:
1149 // No cache to bypass. Store atomics implicitly write through the L1
1150 // cache.
1151 break;
1152 default:
1153 llvm_unreachable("Unsupported synchronization scope");
1154 }
1155 }
1156
1157 /// The scratch address space does not need the global memory caches
1158 /// to be bypassed as all memory operations by the same thread are
1159 /// sequentially consistent, and no other thread can access scratch
1160 /// memory.
1161
1162 /// Other address spaces do not have a cache.
1163
1164 return Changed;
1165 }
1166
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1167 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1168 const MachineBasicBlock::iterator &MI,
1169 SIAtomicScope Scope,
1170 SIAtomicAddrSpace AddrSpace) const {
1171 assert(MI->mayLoad() && MI->mayStore());
1172 bool Changed = false;
1173
1174 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1175 switch (Scope) {
1176 case SIAtomicScope::SYSTEM:
1177 case SIAtomicScope::AGENT:
1178 /// Do not set glc for RMW atomic operations as they implicitly bypass
1179 /// the L1 cache, and the glc bit is instead used to indicate if they are
1180 /// return or no-return.
1181 break;
1182 case SIAtomicScope::WORKGROUP:
1183 case SIAtomicScope::WAVEFRONT:
1184 case SIAtomicScope::SINGLETHREAD:
1185 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1186 break;
1187 default:
1188 llvm_unreachable("Unsupported synchronization scope");
1189 }
1190 }
1191
1192 return Changed;
1193 }
1194
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const1195 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1196 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1197 bool IsVolatile, bool IsNonTemporal) const {
1198 // Only handle load and store, not atomic read-modify-write insructions. The
1199 // latter use glc to indicate if the atomic returns a result and so must not
1200 // be used for cache control.
1201 assert(MI->mayLoad() ^ MI->mayStore());
1202
1203 // Only update load and store, not LLVM IR atomic read-modify-write
1204 // instructions. The latter are always marked as volatile so cannot sensibly
1205 // handle it as do not want to pessimize all atomics. Also they do not support
1206 // the nontemporal attribute.
1207 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1208
1209 bool Changed = false;
1210
1211 if (IsVolatile) {
1212 if (Op == SIMemOp::LOAD) {
1213 Changed |= enableGLCBit(MI);
1214 }
1215
1216 // Ensure operation has completed at system scope to cause all volatile
1217 // operations to be visible outside the program in a global order. Do not
1218 // request cross address space as only the global address space can be
1219 // observable outside the program, so no need to cause a waitcnt for LDS
1220 // address space operations.
1221 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1222 Position::AFTER);
1223
1224 return Changed;
1225 }
1226
1227 if (IsNonTemporal) {
1228 // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
1229 Changed |= enableGLCBit(MI);
1230 Changed |= enableSLCBit(MI);
1231 return Changed;
1232 }
1233
1234 return Changed;
1235 }
1236
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1237 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1238 SIAtomicScope Scope,
1239 SIAtomicAddrSpace AddrSpace,
1240 SIMemOp Op,
1241 bool IsCrossAddrSpaceOrdering,
1242 Position Pos) const {
1243 if (ST.isTgSplitEnabled()) {
1244 // In threadgroup split mode the waves of a work-group can be executing on
1245 // different CUs. Therefore need to wait for global or GDS memory operations
1246 // to complete to ensure they are visible to waves in the other CUs.
1247 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1248 // the same CU, so no need to wait for global memory as all waves in the
1249 // work-group access the same the L1, nor wait for GDS as access are ordered
1250 // on a CU.
1251 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1252 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1253 (Scope == SIAtomicScope::WORKGROUP)) {
1254 // Same as GFX7 using agent scope.
1255 Scope = SIAtomicScope::AGENT;
1256 }
1257 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1258 // LDS memory operations.
1259 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1260 }
1261 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1262 IsCrossAddrSpaceOrdering, Pos);
1263 }
1264
/// Insert the cache invalidates required by an acquire at \p Scope on
/// \p AddrSpace, placed before or after \p MI according to \p Pos.
/// Returns true if any instruction was inserted.
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator; step past MI to insert after it.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope. The widened scope is passed to the
        // GFX7 delegation below.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  // Restore the iterator before delegating so GFX7 sees the caller's
  // position.
  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
1330
/// Insert the cache writeback required by a release at \p Scope on
/// \p AddrSpace, placed before or after \p MI according to \p Pos, then
/// delegate to GFX7 for the release wait. Returns true if any instruction
/// was inserted.
bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator; step past MI to insert after it.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // Restore the iterator before delegating so GFX7 sees the caller's
  // position.
  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}
1378
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1379 bool SIGfx10CacheControl::enableLoadCacheBypass(
1380 const MachineBasicBlock::iterator &MI,
1381 SIAtomicScope Scope,
1382 SIAtomicAddrSpace AddrSpace) const {
1383 assert(MI->mayLoad() && !MI->mayStore());
1384 bool Changed = false;
1385
1386 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1387 /// TODO Do not set glc for rmw atomic operations as they
1388 /// implicitly bypass the L0/L1 caches.
1389
1390 switch (Scope) {
1391 case SIAtomicScope::SYSTEM:
1392 case SIAtomicScope::AGENT:
1393 Changed |= enableGLCBit(MI);
1394 Changed |= enableDLCBit(MI);
1395 break;
1396 case SIAtomicScope::WORKGROUP:
1397 // In WGP mode the waves of a work-group can be executing on either CU of
1398 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1399 // CU mode all waves of a work-group are on the same CU, and so the L0
1400 // does not need to be bypassed.
1401 if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
1402 break;
1403 case SIAtomicScope::WAVEFRONT:
1404 case SIAtomicScope::SINGLETHREAD:
1405 // No cache to bypass.
1406 break;
1407 default:
1408 llvm_unreachable("Unsupported synchronization scope");
1409 }
1410 }
1411
1412 /// The scratch address space does not need the global memory caches
1413 /// to be bypassed as all memory operations by the same thread are
1414 /// sequentially consistent, and no other thread can access scratch
1415 /// memory.
1416
1417 /// Other address spaces do not have a cache.
1418
1419 return Changed;
1420 }
1421
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal) const1422 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1423 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1424 bool IsVolatile, bool IsNonTemporal) const {
1425
1426 // Only handle load and store, not atomic read-modify-write insructions. The
1427 // latter use glc to indicate if the atomic returns a result and so must not
1428 // be used for cache control.
1429 assert(MI->mayLoad() ^ MI->mayStore());
1430
1431 // Only update load and store, not LLVM IR atomic read-modify-write
1432 // instructions. The latter are always marked as volatile so cannot sensibly
1433 // handle it as do not want to pessimize all atomics. Also they do not support
1434 // the nontemporal attribute.
1435 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1436
1437 bool Changed = false;
1438
1439 if (IsVolatile) {
1440
1441 if (Op == SIMemOp::LOAD) {
1442 Changed |= enableGLCBit(MI);
1443 Changed |= enableDLCBit(MI);
1444 }
1445
1446 // Ensure operation has completed at system scope to cause all volatile
1447 // operations to be visible outside the program in a global order. Do not
1448 // request cross address space as only the global address space can be
1449 // observable outside the program, so no need to cause a waitcnt for LDS
1450 // address space operations.
1451 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1452 Position::AFTER);
1453 return Changed;
1454 }
1455
1456 if (IsNonTemporal) {
1457 // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1458 Changed |= enableSLCBit(MI);
1459 return Changed;
1460 }
1461
1462 return Changed;
1463 }
1464
/// Insert the waitcnt instructions strong enough to order, at \p Scope,
/// memory operations of kind \p Op on the address spaces in \p AddrSpace.
/// The waits are placed before or after \p MI according to \p Pos. On GFX10
/// store completion is tracked by a separate vscnt counter, waited on via
/// S_WAITCNT_VSCNT. Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator, so step past MI to insert after it;
  // the iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  // Which hardware counters need to be waited on (to zero).
  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Loads are tracked by vmcnt, stores by the separate vscnt counter.
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // A counter encoded as 0 waits for all outstanding operations it tracks;
    // counters not being waited on are left at their maximum (i.e. no wait).
    // expcnt is never waited on here.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    // vscnt lives in its own instruction; SGPR_NULL selects no SGPR operand.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1584
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1585 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1586 SIAtomicScope Scope,
1587 SIAtomicAddrSpace AddrSpace,
1588 Position Pos) const {
1589 if (!InsertCacheInv)
1590 return false;
1591
1592 bool Changed = false;
1593
1594 MachineBasicBlock &MBB = *MI->getParent();
1595 DebugLoc DL = MI->getDebugLoc();
1596
1597 if (Pos == Position::AFTER)
1598 ++MI;
1599
1600 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1601 switch (Scope) {
1602 case SIAtomicScope::SYSTEM:
1603 case SIAtomicScope::AGENT:
1604 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1605 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1606 Changed = true;
1607 break;
1608 case SIAtomicScope::WORKGROUP:
1609 // In WGP mode the waves of a work-group can be executing on either CU of
1610 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1611 // in CU mode and all waves of a work-group are on the same CU, and so the
1612 // L0 does not need to be invalidated.
1613 if (!ST.isCuModeEnabled()) {
1614 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1615 Changed = true;
1616 }
1617 break;
1618 case SIAtomicScope::WAVEFRONT:
1619 case SIAtomicScope::SINGLETHREAD:
1620 // No cache to invalidate.
1621 break;
1622 default:
1623 llvm_unreachable("Unsupported synchronization scope");
1624 }
1625 }
1626
1627 /// The scratch address space does not need the global memory cache
1628 /// to be flushed as all memory operations by the same thread are
1629 /// sequentially consistent, and no other thread can access scratch
1630 /// memory.
1631
1632 /// Other address spaces do not have a cache.
1633
1634 if (Pos == Position::AFTER)
1635 --MI;
1636
1637 return Changed;
1638 }
1639
removeAtomicPseudoMIs()1640 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1641 if (AtomicPseudoMIs.empty())
1642 return false;
1643
1644 for (auto &MI : AtomicPseudoMIs)
1645 MI->eraseFromParent();
1646
1647 AtomicPseudoMIs.clear();
1648 return true;
1649 }
1650
/// Expand a load according to the memory model described by \p MOI: enable
/// the cache bypass appropriate for its scope and insert the waits and
/// invalidates implied by its ordering. Returns true if the function was
/// changed.
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Monotonic and stronger orderings must read from the cache level
    // appropriate for their scope.
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    // seq_cst additionally requires all prior memory operations to have
    // completed before the load executes.
    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire semantics: wait for the load itself to complete, then
    // invalidate caches so later accesses see other agents' writes.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
1695
expandStore(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)1696 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1697 MachineBasicBlock::iterator &MI) {
1698 assert(!MI->mayLoad() && MI->mayStore());
1699
1700 bool Changed = false;
1701
1702 if (MOI.isAtomic()) {
1703 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1704 MOI.getOrdering() == AtomicOrdering::Release ||
1705 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1706 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1707 MOI.getOrderingAddrSpace());
1708 }
1709
1710 if (MOI.getOrdering() == AtomicOrdering::Release ||
1711 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1712 Changed |= CC->insertRelease(MI, MOI.getScope(),
1713 MOI.getOrderingAddrSpace(),
1714 MOI.getIsCrossAddressSpaceOrdering(),
1715 Position::BEFORE);
1716
1717 return Changed;
1718 }
1719
1720 // Atomic instructions already bypass caches to the scope specified by the
1721 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1722 // need additional treatment.
1723 Changed |= CC->enableVolatileAndOrNonTemporal(
1724 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1725 MOI.isNonTemporal());
1726 return Changed;
1727 }
1728
expandAtomicFence(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)1729 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1730 MachineBasicBlock::iterator &MI) {
1731 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1732
1733 AtomicPseudoMIs.push_back(MI);
1734 bool Changed = false;
1735
1736 if (MOI.isAtomic()) {
1737 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1738 MOI.getOrdering() == AtomicOrdering::Release ||
1739 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1740 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1741 /// TODO: This relies on a barrier always generating a waitcnt
1742 /// for LDS to ensure it is not reordered with the completion of
1743 /// the proceeding LDS operations. If barrier had a memory
1744 /// ordering and memory scope, then library does not need to
1745 /// generate a fence. Could add support in this file for
1746 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1747 /// adding S_WAITCNT before a S_BARRIER.
1748 Changed |= CC->insertRelease(MI, MOI.getScope(),
1749 MOI.getOrderingAddrSpace(),
1750 MOI.getIsCrossAddressSpaceOrdering(),
1751 Position::BEFORE);
1752
1753 // TODO: If both release and invalidate are happening they could be combined
1754 // to use the single "BUFFER_WBINV*" instruction. This could be done by
1755 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1756 // track cache invalidate and write back instructions.
1757
1758 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1759 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1760 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1761 Changed |= CC->insertAcquire(MI, MOI.getScope(),
1762 MOI.getOrderingAddrSpace(),
1763 Position::BEFORE);
1764
1765 return Changed;
1766 }
1767
1768 return Changed;
1769 }
1770
expandAtomicCmpxchgOrRmw(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)1771 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1772 MachineBasicBlock::iterator &MI) {
1773 assert(MI->mayLoad() && MI->mayStore());
1774
1775 bool Changed = false;
1776
1777 if (MOI.isAtomic()) {
1778 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1779 MOI.getOrdering() == AtomicOrdering::Acquire ||
1780 MOI.getOrdering() == AtomicOrdering::Release ||
1781 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1782 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1783 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1784 MOI.getInstrAddrSpace());
1785 }
1786
1787 if (MOI.getOrdering() == AtomicOrdering::Release ||
1788 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1789 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1790 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1791 Changed |= CC->insertRelease(MI, MOI.getScope(),
1792 MOI.getOrderingAddrSpace(),
1793 MOI.getIsCrossAddressSpaceOrdering(),
1794 Position::BEFORE);
1795
1796 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1797 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1798 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1799 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1800 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1801 Changed |= CC->insertWait(MI, MOI.getScope(),
1802 MOI.getInstrAddrSpace(),
1803 isAtomicRet(*MI) ? SIMemOp::LOAD :
1804 SIMemOp::STORE,
1805 MOI.getIsCrossAddressSpaceOrdering(),
1806 Position::AFTER);
1807 Changed |= CC->insertAcquire(MI, MOI.getScope(),
1808 MOI.getOrderingAddrSpace(),
1809 Position::AFTER);
1810 }
1811
1812 return Changed;
1813 }
1814
1815 return Changed;
1816 }
1817
/// Pass entry point: walk every instruction in the function, unbundle
/// post-RA memory bundles, and expand each maybe-atomic memory operation
/// according to the memory model. Returns true if the function was changed.
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  // Cache-control strategy is selected per subtarget generation.
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        // ++II moves past the BUNDLE header to its first bundled instruction;
        // the loop then detaches each member from its predecessor.
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          // Reads that were internal to the bundle become ordinary reads.
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        // Delete the now-empty BUNDLE header and resume iteration at the
        // first unbundled instruction (II was advanced above).
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      // Only instructions flagged as possibly atomic need legalization.
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      // Classify the instruction and dispatch to the matching expansion.
      // Each get*Info returns a value only for its own kind of operation.
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  // Remove fence pseudos queued by expandAtomicFence.
  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
1859
// Register the pass with the LLVM pass infrastructure.
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

// Pass identification: the address of ID serves as the unique pass identity.
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1864
/// Factory used by the AMDGPU target to add this pass to the codegen
/// pipeline; the caller takes ownership of the returned pass.
FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
1868