//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
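
// Illustrative only (not part of the pass): since SIAtomicAddrSpace is a
// bitmask enum, membership tests throughout this file use the OR/AND idiom,
// for example:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesGlobal =
//       (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE; // true
//   bool FlatOnly =
//       (AS & ~SIAtomicAddrSpace::FLAT) == SIAtomicAddrSpace::NONE;  // true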

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if the memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if the memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if the ordering constraint of the machine instruction used
  /// to create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The bit set of address spaces covered by address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}
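
// Illustration (assuming the "one-as" sync-scope names that
// AMDGPUMachineModuleInfo registers, e.g. "agent" and "agent-one-as"):
//
//   syncscope("agent")        -> (AGENT, ATOMIC, true)
//   syncscope("agent-one-as") -> (AGENT, ATOMIC & InstrAddrSpace, false)
//   unrecognized scope        -> None; callers report it as unsupported.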

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}
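
// Sketch of the merging above: an instruction carrying two MMOs, say a
// monotonic global access plus a non-atomic LDS access, ends up with
// InstrAddrSpace = GLOBAL | LDS, Ordering = Monotonic (the strongest seen),
// IsNonTemporal only if every MMO is nontemporal, and IsVolatile if any MMO
// is volatile.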

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}
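
// The checks above run from most to least specific. As an illustration
// (subtarget names only, no new behavior): gfx90a selects
// SIGfx90ACacheControl via hasGFX90AInsts(); gfx600 (SOUTHERN_ISLANDS)
// selects SIGfx6CacheControl; gfx803 and gfx906 fall through to
// SIGfx7CacheControl; gfx1030 selects SIGfx10CacheControl.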

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
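
// As an illustrative sketch (the exact instruction selection depends on the
// target and addressing mode): a volatile global load such as
//
//   %v = load volatile i32, i32 addrspace(1)* %p
//
// may come out on GFX6 as a glc load followed by a wait:
//
//   buffer_load_dword ... glc
//   s_waitcnt vmcnt(0)
//
// whereas a nontemporal load or store just gets glc and slc set with no
// wait inserted.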

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
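
// For example, a release at agent scope over the flat address space (with
// cross address space ordering) sets both VMCnt and LGKMCnt, so a single
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// is emitted; expcnt is left at its maximum and is not waited on.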

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
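
// For example, an agent-scope acquire on a gfx7 HSA target emits
// buffer_wbinvl1_vol, while amdpal and mesa3d targets use buffer_wbinvl1;
// at workgroup scope and below nothing needs to be invalidated.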

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so there is no need to wait for global memory as all
    // waves in the work-group access the same L1, nor wait for GDS as accesses
    // are ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}
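
// As a sketch: with threadgroup split mode enabled (assuming the tgsplit
// subtarget feature, e.g. -mattr=+tgsplit), a workgroup-scope wait over the
// global address space is handled exactly like an agent-scope one, and any
// LDS component is dropped since LDS cannot be allocated in that mode.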

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to bypass the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // and so the L0 does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1510
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1511 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1512 SIAtomicScope Scope,
1513 SIAtomicAddrSpace AddrSpace,
1514 Position Pos) const {
1515 if (!InsertCacheInv)
1516 return false;
1517
1518 bool Changed = false;
1519
1520 MachineBasicBlock &MBB = *MI->getParent();
1521 DebugLoc DL = MI->getDebugLoc();
1522
1523 if (Pos == Position::AFTER)
1524 ++MI;
1525
1526 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1527 switch (Scope) {
1528 case SIAtomicScope::SYSTEM:
1529 case SIAtomicScope::AGENT:
1530 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1531 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1532 Changed = true;
1533 break;
1534 case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore the L0, which is per CU, must be invalidated.
      // In CU mode all waves of a work-group are on the same CU, so the L0
      // does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // The scratch address space does not need the global memory cache
  // to be flushed as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.

  // Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

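/// Erase the atomic pseudo instructions (currently ATOMIC_FENCE) collected
/// during expansion, once their effects have been lowered to real waits and
/// cache operations.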
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

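/// Expand an atomic or volatile/nontemporal load described by \p MOI.
/// Acquire-or-stronger orderings bypass the caches at the requested scope,
/// wait for the load to complete, and invalidate potentially stale cache
/// lines; seq_cst additionally waits for all prior memory operations before
/// the load issues. As a rough sketch (the exact counters and invalidates
/// depend on scope, address spaces, and subtarget), a gfx10 agent-scope
/// acquire load may become:
///   flat_load_dword v0, ... glc dlc
///   s_waitcnt vmcnt(0) lgkmcnt(0)
///   buffer_gl0_inv
///   buffer_gl1_inv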
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

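/// Expand an atomic or volatile/nontemporal store described by \p MOI.
/// Release-or-stronger orderings bypass the caches at the requested scope
/// and insert a release (a wait for prior operations, plus any write-back
/// the subtarget needs) before the store.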
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

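/// Expand an ATOMIC_FENCE pseudo into the waits and cache maintenance its
/// ordering implies: a release sequence before the fence point for
/// release-or-stronger orderings, and a cache invalidate for
/// acquire-or-stronger orderings. The pseudo itself is queued for removal.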
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      // TODO: This relies on a barrier always generating a waitcnt
      // for LDS to ensure it is not reordered with the completion of
      // the preceding LDS operations. If the barrier had a memory
      // ordering and memory scope, then the library would not need to
      // generate a fence. Could add support in this file for
      // barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      // adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both the release and the invalidate are happening, they could
    // be combined to use the single "BUFFER_WBINV*" instruction. This could
    // be done by reorganizing this code, or as part of optimizing the
    // SIInsertWaitcnts pass to track cache invalidate and write-back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

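/// Expand an atomic cmpxchg or read-modify-write described by \p MOI,
/// honoring both the success and failure orderings: a release sequence
/// before the instruction for release semantics, and a wait plus cache
/// invalidate after it for acquire semantics. For a returning atomic the
/// post-wait covers its load component, otherwise its store component.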
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

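/// Walk every instruction in the function, unbundle memory-access bundles
/// left by the post-RA scheduler, and expand each instruction that may be
/// atomic according to the info attached to its memory operands.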
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
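  // Pick the cache-control implementation matching this subtarget's
  // generation; it knows which cache-bypass bits, waits, and invalidates
  // the hardware provides.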
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}