//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
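// Flags combine with bitwise OR; for example, the release handling below
// passes SIMemOp::LOAD | SIMemOp::STORE to insertWait() so that both kinds of
// access are ordered.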

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
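// Membership tests throughout this file use bitwise AND against NONE, e.g.:
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE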

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}
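// Illustrative use (see the cache controls below): setting the GLC bit on an
// instruction that has one is written as
//   enableNamedBit<AMDGPU::OpName::glc>(MI);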

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }
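  // For example, an ordering over only the LDS address space by an
  // instruction that itself accesses only LDS involves a single address
  // space, so the cross address space ordering flag is cleared above.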

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal.
  /// Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
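  // The "one-as" sync scope variants order only the address spaces the
  // instruction itself accesses, so the returned address space set is masked
  // by InstrScope and no cross address space ordering is requested.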
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

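// Note a FLAT address may alias global, LDS, and scratch memory, so it maps
// conservatively to the union of those address spaces (SIAtomicAddrSpace::FLAT
// above).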
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which we cannot
  // sensibly handle as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

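  // Encode a single S_WAITCNT immediate: a count of 0 forces the
  // corresponding counter to drain, while a counter that needs no wait keeps
  // its maximal bitmask value and so is left unconstrained.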
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

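// On this target a release reduces to waiting for all prior loads and stores
// at the requested scope to complete; no cache writeback instruction is
// inserted.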
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which we cannot
  // sensibly handle as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

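  // On gfx10 vector memory loads retire through the vmcnt counter while
  // stores retire through the separate vscnt counter, so a wait on stores is
  // emitted as an explicit S_WAITCNT_VSCNT below rather than being folded
  // into the combined S_WAITCNT immediate.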
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

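    // A sequentially consistent load must not be reordered with any prior
    // memory operation at the ordering scope, so complete them all before the
    // load executes.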
    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code or as part of optimizing SIInsertWaitcnt pass
    // to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

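    // For the acquire side, wait on the kind of access the atomic itself
    // retires as: a returning atomic completes like a load, a non-returning
    // one like a store.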
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}