1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9
; Kernel under test: reads an i32 from %in with an atomic load that uses
; syncscope("agent") and unordered ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated check lines below expect, for every configuration in the
; header, a plain load followed by a plain store: the load carries no glc/dlc
; modifier and no cache-invalidate instruction appears after it.
10define amdgpu_kernel void @global_agent_unordered_load(
11; GFX6-LABEL: global_agent_unordered_load:
12; GFX6:       ; %bb.0: ; %entry
13; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
14; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
15; GFX6-NEXT:    s_mov_b32 s2, -1
16; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX6-NEXT:    s_mov_b32 s0, s4
18; GFX6-NEXT:    s_mov_b32 s1, s5
19; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
20; GFX6-NEXT:    s_mov_b32 s4, s6
21; GFX6-NEXT:    s_mov_b32 s5, s7
22; GFX6-NEXT:    s_mov_b32 s6, s2
23; GFX6-NEXT:    s_mov_b32 s7, s3
24; GFX6-NEXT:    s_waitcnt vmcnt(0)
25; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
26; GFX6-NEXT:    s_endpgm
27;
28; GFX7-LABEL: global_agent_unordered_load:
29; GFX7:       ; %bb.0: ; %entry
30; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
31; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX7-NEXT:    v_mov_b32_e32 v0, s0
33; GFX7-NEXT:    v_mov_b32_e32 v1, s1
34; GFX7-NEXT:    flat_load_dword v0, v[0:1]
35; GFX7-NEXT:    v_mov_b32_e32 v2, s2
36; GFX7-NEXT:    v_mov_b32_e32 v3, s3
37; GFX7-NEXT:    s_waitcnt vmcnt(0)
38; GFX7-NEXT:    flat_store_dword v[2:3], v0
39; GFX7-NEXT:    s_endpgm
40;
41; GFX10-WGP-LABEL: global_agent_unordered_load:
42; GFX10-WGP:       ; %bb.0: ; %entry
43; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
44; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
45; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
47; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
48; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
49; GFX10-WGP-NEXT:    s_endpgm
50;
51; GFX10-CU-LABEL: global_agent_unordered_load:
52; GFX10-CU:       ; %bb.0: ; %entry
53; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
54; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
55; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
57; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
59; GFX10-CU-NEXT:    s_endpgm
60;
61; SKIP-CACHE-INV-LABEL: global_agent_unordered_load:
62; SKIP-CACHE-INV:       ; %bb.0: ; %entry
63; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
64; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
66; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
67; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
68; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
69; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
70; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
71; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
72; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
73; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
74; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
75; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
76; SKIP-CACHE-INV-NEXT:    s_endpgm
77;
78; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load:
79; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
80; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
81; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
82; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
84; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
85; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
86; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
87;
88; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load:
89; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
90; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
91; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
92; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
94; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
95; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
96; GFX90A-TGSPLIT-NEXT:    s_endpgm
97    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
98entry:
  ; Operation under test: agent-scope atomic load with unordered ordering.
99  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4
  ; Ordinary store of the loaded value to %out.
100  store i32 %val, i32 addrspace(1)* %out
101  ret void
102}
103
; Kernel under test: reads an i32 from %in with an atomic load that uses
; syncscope("agent") and monotonic ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated check lines below expect the load to carry the glc
; modifier on the gfx600, gfx700 and gfx90a configurations and "glc dlc" on
; both gfx1010 configurations. No cache-invalidate instruction is expected
; after the load.
104define amdgpu_kernel void @global_agent_monotonic_load(
105; GFX6-LABEL: global_agent_monotonic_load:
106; GFX6:       ; %bb.0: ; %entry
107; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
108; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
109; GFX6-NEXT:    s_mov_b32 s2, -1
110; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX6-NEXT:    s_mov_b32 s0, s4
112; GFX6-NEXT:    s_mov_b32 s1, s5
113; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
114; GFX6-NEXT:    s_mov_b32 s4, s6
115; GFX6-NEXT:    s_mov_b32 s5, s7
116; GFX6-NEXT:    s_mov_b32 s6, s2
117; GFX6-NEXT:    s_mov_b32 s7, s3
118; GFX6-NEXT:    s_waitcnt vmcnt(0)
119; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
120; GFX6-NEXT:    s_endpgm
121;
122; GFX7-LABEL: global_agent_monotonic_load:
123; GFX7:       ; %bb.0: ; %entry
124; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
125; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX7-NEXT:    v_mov_b32_e32 v0, s0
127; GFX7-NEXT:    v_mov_b32_e32 v1, s1
128; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
129; GFX7-NEXT:    v_mov_b32_e32 v2, s2
130; GFX7-NEXT:    v_mov_b32_e32 v3, s3
131; GFX7-NEXT:    s_waitcnt vmcnt(0)
132; GFX7-NEXT:    flat_store_dword v[2:3], v0
133; GFX7-NEXT:    s_endpgm
134;
135; GFX10-WGP-LABEL: global_agent_monotonic_load:
136; GFX10-WGP:       ; %bb.0: ; %entry
137; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
138; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
139; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
141; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
142; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
143; GFX10-WGP-NEXT:    s_endpgm
144;
145; GFX10-CU-LABEL: global_agent_monotonic_load:
146; GFX10-CU:       ; %bb.0: ; %entry
147; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
148; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
149; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
151; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
152; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
153; GFX10-CU-NEXT:    s_endpgm
154;
155; SKIP-CACHE-INV-LABEL: global_agent_monotonic_load:
156; SKIP-CACHE-INV:       ; %bb.0: ; %entry
157; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
158; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
159; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
160; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
161; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
162; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
163; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
164; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
165; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
166; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
167; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
168; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
169; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
170; SKIP-CACHE-INV-NEXT:    s_endpgm
171;
172; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load:
173; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
174; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
175; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
176; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
178; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
179; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
180; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
181;
182; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load:
183; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
184; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
185; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
186; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
188; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
189; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
190; GFX90A-TGSPLIT-NEXT:    s_endpgm
191    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
192entry:
  ; Operation under test: agent-scope atomic load with monotonic ordering.
193  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4
  ; Ordinary store of the loaded value to %out.
194  store i32 %val, i32 addrspace(1)* %out
195  ret void
196}
197
; Kernel under test: reads an i32 from %in with an atomic load that uses
; syncscope("agent") and acquire ordering, then writes the value to %out with
; an ordinary (non-atomic) store.
; The autogenerated check lines below expect the load to carry glc ("glc dlc"
; on gfx1010), followed by "s_waitcnt vmcnt(0)" and then:
;   - buffer_wbinvl1 on gfx600,
;   - buffer_wbinvl1_vol on gfx700 and gfx90a,
;   - buffer_gl0_inv plus buffer_gl1_inv on both gfx1010 configurations.
; The amdpal configuration, which passes -amdgcn-skip-cache-invalidations,
; expects the wait but none of these instructions after it.
198define amdgpu_kernel void @global_agent_acquire_load(
199; GFX6-LABEL: global_agent_acquire_load:
200; GFX6:       ; %bb.0: ; %entry
201; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
202; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
203; GFX6-NEXT:    s_mov_b32 s2, -1
204; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX6-NEXT:    s_mov_b32 s0, s4
206; GFX6-NEXT:    s_mov_b32 s1, s5
207; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
208; GFX6-NEXT:    s_waitcnt vmcnt(0)
209; GFX6-NEXT:    buffer_wbinvl1
210; GFX6-NEXT:    s_mov_b32 s4, s6
211; GFX6-NEXT:    s_mov_b32 s5, s7
212; GFX6-NEXT:    s_mov_b32 s6, s2
213; GFX6-NEXT:    s_mov_b32 s7, s3
214; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
215; GFX6-NEXT:    s_endpgm
216;
217; GFX7-LABEL: global_agent_acquire_load:
218; GFX7:       ; %bb.0: ; %entry
219; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
220; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX7-NEXT:    v_mov_b32_e32 v0, s0
222; GFX7-NEXT:    v_mov_b32_e32 v1, s1
223; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
224; GFX7-NEXT:    s_waitcnt vmcnt(0)
225; GFX7-NEXT:    buffer_wbinvl1_vol
226; GFX7-NEXT:    v_mov_b32_e32 v2, s2
227; GFX7-NEXT:    v_mov_b32_e32 v3, s3
228; GFX7-NEXT:    flat_store_dword v[2:3], v0
229; GFX7-NEXT:    s_endpgm
230;
231; GFX10-WGP-LABEL: global_agent_acquire_load:
232; GFX10-WGP:       ; %bb.0: ; %entry
233; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
234; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
235; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
237; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
238; GFX10-WGP-NEXT:    buffer_gl0_inv
239; GFX10-WGP-NEXT:    buffer_gl1_inv
240; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
241; GFX10-WGP-NEXT:    s_endpgm
242;
243; GFX10-CU-LABEL: global_agent_acquire_load:
244; GFX10-CU:       ; %bb.0: ; %entry
245; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
246; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
247; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
249; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
250; GFX10-CU-NEXT:    buffer_gl0_inv
251; GFX10-CU-NEXT:    buffer_gl1_inv
252; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
253; GFX10-CU-NEXT:    s_endpgm
254;
255; SKIP-CACHE-INV-LABEL: global_agent_acquire_load:
256; SKIP-CACHE-INV:       ; %bb.0: ; %entry
257; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
258; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
259; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
260; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
261; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
262; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
263; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
264; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
265; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
266; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
267; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
268; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
269; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
270; SKIP-CACHE-INV-NEXT:    s_endpgm
271;
272; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load:
273; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
274; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
275; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
276; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
278; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
279; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
280; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
281; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
282;
283; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load:
284; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
285; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
286; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
287; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
289; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
290; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
291; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
292; GFX90A-TGSPLIT-NEXT:    s_endpgm
293    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
294entry:
  ; Operation under test: agent-scope atomic load with acquire ordering.
295  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4
  ; Ordinary store of the loaded value to %out.
296  store i32 %val, i32 addrspace(1)* %out
297  ret void
298}
299
; Kernel under test: reads an i32 from %in with an atomic load that uses
; syncscope("agent") and seq_cst ordering, then writes the value to %out with
; an ordinary (non-atomic) store.
; The autogenerated check lines below expect:
;   - "s_waitcnt vmcnt(0) lgkmcnt(0)" ahead of the load on every
;     configuration, plus "s_waitcnt_vscnt null, 0x0" on gfx1010;
;   - the load itself with glc ("glc dlc" on gfx1010);
;   - "s_waitcnt vmcnt(0)" after the load, followed by buffer_wbinvl1
;     (gfx600), buffer_wbinvl1_vol (gfx700, gfx90a) or buffer_gl0_inv plus
;     buffer_gl1_inv (gfx1010).
; The amdpal configuration, which passes -amdgcn-skip-cache-invalidations,
; expects both waits but none of the trailing wbinvl/inv instructions.
300define amdgpu_kernel void @global_agent_seq_cst_load(
301; GFX6-LABEL: global_agent_seq_cst_load:
302; GFX6:       ; %bb.0: ; %entry
303; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
304; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
305; GFX6-NEXT:    s_mov_b32 s2, -1
306; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX6-NEXT:    s_mov_b32 s0, s4
308; GFX6-NEXT:    s_mov_b32 s1, s5
309; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
310; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
311; GFX6-NEXT:    s_waitcnt vmcnt(0)
312; GFX6-NEXT:    buffer_wbinvl1
313; GFX6-NEXT:    s_mov_b32 s4, s6
314; GFX6-NEXT:    s_mov_b32 s5, s7
315; GFX6-NEXT:    s_mov_b32 s6, s2
316; GFX6-NEXT:    s_mov_b32 s7, s3
317; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
318; GFX6-NEXT:    s_endpgm
319;
320; GFX7-LABEL: global_agent_seq_cst_load:
321; GFX7:       ; %bb.0: ; %entry
322; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
323; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX7-NEXT:    v_mov_b32_e32 v0, s0
325; GFX7-NEXT:    v_mov_b32_e32 v1, s1
326; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
327; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
328; GFX7-NEXT:    s_waitcnt vmcnt(0)
329; GFX7-NEXT:    buffer_wbinvl1_vol
330; GFX7-NEXT:    v_mov_b32_e32 v2, s2
331; GFX7-NEXT:    v_mov_b32_e32 v3, s3
332; GFX7-NEXT:    flat_store_dword v[2:3], v0
333; GFX7-NEXT:    s_endpgm
334;
335; GFX10-WGP-LABEL: global_agent_seq_cst_load:
336; GFX10-WGP:       ; %bb.0: ; %entry
337; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
338; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
339; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
340; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
341; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
342; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
343; GFX10-WGP-NEXT:    buffer_gl0_inv
344; GFX10-WGP-NEXT:    buffer_gl1_inv
345; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
346; GFX10-WGP-NEXT:    s_endpgm
347;
348; GFX10-CU-LABEL: global_agent_seq_cst_load:
349; GFX10-CU:       ; %bb.0: ; %entry
350; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
351; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
352; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
353; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
354; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
355; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
356; GFX10-CU-NEXT:    buffer_gl0_inv
357; GFX10-CU-NEXT:    buffer_gl1_inv
358; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
359; GFX10-CU-NEXT:    s_endpgm
360;
361; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_load:
362; SKIP-CACHE-INV:       ; %bb.0: ; %entry
363; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
364; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
365; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
366; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
368; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
369; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
370; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
371; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
372; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
373; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
374; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
375; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
376; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
377; SKIP-CACHE-INV-NEXT:    s_endpgm
378;
379; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load:
380; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
381; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
382; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
383; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
384; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
385; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
386; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
387; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
388; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
389;
390; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load:
391; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
392; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
393; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
394; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
395; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
396; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
397; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
398; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
399; GFX90A-TGSPLIT-NEXT:    s_endpgm
400    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
401entry:
  ; Operation under test: agent-scope atomic load with seq_cst ordering.
402  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4
  ; Ordinary store of the loaded value to %out.
403  store i32 %val, i32 addrspace(1)* %out
404  ret void
405}
406
; Kernel under test: writes the i32 kernel argument %in to %out with an atomic
; store that uses syncscope("agent") and unordered ordering.
; The autogenerated check lines below expect a plain store on every
; configuration. The only wait before it is the "s_waitcnt lgkmcnt(0)" that
; follows the s_load instructions.
407define amdgpu_kernel void @global_agent_unordered_store(
408; GFX6-LABEL: global_agent_unordered_store:
409; GFX6:       ; %bb.0: ; %entry
410; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
411; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
412; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
413; GFX6-NEXT:    s_mov_b32 s2, -1
414; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX6-NEXT:    v_mov_b32_e32 v0, s6
416; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
417; GFX6-NEXT:    s_endpgm
418;
419; GFX7-LABEL: global_agent_unordered_store:
420; GFX7:       ; %bb.0: ; %entry
421; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
422; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
423; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX7-NEXT:    v_mov_b32_e32 v2, s2
425; GFX7-NEXT:    v_mov_b32_e32 v0, s0
426; GFX7-NEXT:    v_mov_b32_e32 v1, s1
427; GFX7-NEXT:    flat_store_dword v[0:1], v2
428; GFX7-NEXT:    s_endpgm
429;
430; GFX10-WGP-LABEL: global_agent_unordered_store:
431; GFX10-WGP:       ; %bb.0: ; %entry
432; GFX10-WGP-NEXT:    s_clause 0x1
433; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
434; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
435; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
436; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
438; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
439; GFX10-WGP-NEXT:    s_endpgm
440;
441; GFX10-CU-LABEL: global_agent_unordered_store:
442; GFX10-CU:       ; %bb.0: ; %entry
443; GFX10-CU-NEXT:    s_clause 0x1
444; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
445; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
446; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
447; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
448; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
449; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
450; GFX10-CU-NEXT:    s_endpgm
451;
452; SKIP-CACHE-INV-LABEL: global_agent_unordered_store:
453; SKIP-CACHE-INV:       ; %bb.0: ; %entry
454; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
455; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
456; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
457; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
458; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
459; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
460; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
461; SKIP-CACHE-INV-NEXT:    s_endpgm
462;
463; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store:
464; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
465; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
466; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
467; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
468; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
470; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
471; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
472;
473; GFX90A-TGSPLIT-LABEL: global_agent_unordered_store:
474; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
475; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
476; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
477; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
478; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
480; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
481; GFX90A-TGSPLIT-NEXT:    s_endpgm
482    i32 %in, i32 addrspace(1)* %out) {
483entry:
  ; Operation under test: agent-scope atomic store with unordered ordering.
484  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4
485  ret void
486}
487
; Kernel under test: writes the i32 kernel argument %in to %out with an atomic
; store that uses syncscope("agent") and monotonic ordering.
; The autogenerated check lines below expect a plain store on every
; configuration. The only wait before it is the "s_waitcnt lgkmcnt(0)" that
; follows the s_load instructions.
488define amdgpu_kernel void @global_agent_monotonic_store(
489; GFX6-LABEL: global_agent_monotonic_store:
490; GFX6:       ; %bb.0: ; %entry
491; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
492; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
493; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
494; GFX6-NEXT:    s_mov_b32 s2, -1
495; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX6-NEXT:    v_mov_b32_e32 v0, s6
497; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
498; GFX6-NEXT:    s_endpgm
499;
500; GFX7-LABEL: global_agent_monotonic_store:
501; GFX7:       ; %bb.0: ; %entry
502; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
503; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
504; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX7-NEXT:    v_mov_b32_e32 v2, s2
506; GFX7-NEXT:    v_mov_b32_e32 v0, s0
507; GFX7-NEXT:    v_mov_b32_e32 v1, s1
508; GFX7-NEXT:    flat_store_dword v[0:1], v2
509; GFX7-NEXT:    s_endpgm
510;
511; GFX10-WGP-LABEL: global_agent_monotonic_store:
512; GFX10-WGP:       ; %bb.0: ; %entry
513; GFX10-WGP-NEXT:    s_clause 0x1
514; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
515; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
516; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
517; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
519; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
520; GFX10-WGP-NEXT:    s_endpgm
521;
522; GFX10-CU-LABEL: global_agent_monotonic_store:
523; GFX10-CU:       ; %bb.0: ; %entry
524; GFX10-CU-NEXT:    s_clause 0x1
525; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
526; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
527; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
528; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
530; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
531; GFX10-CU-NEXT:    s_endpgm
532;
533; SKIP-CACHE-INV-LABEL: global_agent_monotonic_store:
534; SKIP-CACHE-INV:       ; %bb.0: ; %entry
535; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
536; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
537; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
539; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
541; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
542; SKIP-CACHE-INV-NEXT:    s_endpgm
543;
544; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store:
545; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
546; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
547; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
548; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
549; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
551; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
552; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
553;
554; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store:
555; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
556; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
557; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
558; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
559; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
561; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
562; GFX90A-TGSPLIT-NEXT:    s_endpgm
563    i32 %in, i32 addrspace(1)* %out) {
564entry:
  ; Operation under test: agent-scope atomic store with monotonic ordering.
565  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4
566  ret void
567}
568
; Kernel under test: writes the i32 kernel argument %in to %out with an atomic
; store that uses syncscope("agent") and release ordering.
; The autogenerated check lines below expect "s_waitcnt vmcnt(0) lgkmcnt(0)"
; ahead of the store on every configuration, plus "s_waitcnt_vscnt null, 0x0"
; on both gfx1010 configurations. The store itself carries no modifier and no
; instruction follows it before s_endpgm.
569define amdgpu_kernel void @global_agent_release_store(
570; GFX6-LABEL: global_agent_release_store:
571; GFX6:       ; %bb.0: ; %entry
572; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
573; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
574; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
575; GFX6-NEXT:    s_mov_b32 s2, -1
576; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX6-NEXT:    v_mov_b32_e32 v0, s6
578; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
579; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
580; GFX6-NEXT:    s_endpgm
581;
582; GFX7-LABEL: global_agent_release_store:
583; GFX7:       ; %bb.0: ; %entry
584; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
585; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
586; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX7-NEXT:    v_mov_b32_e32 v2, s2
588; GFX7-NEXT:    v_mov_b32_e32 v0, s0
589; GFX7-NEXT:    v_mov_b32_e32 v1, s1
590; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
591; GFX7-NEXT:    flat_store_dword v[0:1], v2
592; GFX7-NEXT:    s_endpgm
593;
594; GFX10-WGP-LABEL: global_agent_release_store:
595; GFX10-WGP:       ; %bb.0: ; %entry
596; GFX10-WGP-NEXT:    s_clause 0x1
597; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
598; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
599; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
600; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
602; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
603; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
604; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
605; GFX10-WGP-NEXT:    s_endpgm
606;
607; GFX10-CU-LABEL: global_agent_release_store:
608; GFX10-CU:       ; %bb.0: ; %entry
609; GFX10-CU-NEXT:    s_clause 0x1
610; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
611; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
612; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
613; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
615; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
616; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
617; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
618; GFX10-CU-NEXT:    s_endpgm
619;
620; SKIP-CACHE-INV-LABEL: global_agent_release_store:
621; SKIP-CACHE-INV:       ; %bb.0: ; %entry
622; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
623; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
624; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
625; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
626; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
627; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
628; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
629; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
630; SKIP-CACHE-INV-NEXT:    s_endpgm
631;
632; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store:
633; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
634; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
635; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
636; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
637; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
639; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
640; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
641; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
642;
643; GFX90A-TGSPLIT-LABEL: global_agent_release_store:
644; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
645; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
646; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
647; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
648; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
650; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
651; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
652; GFX90A-TGSPLIT-NEXT:    s_endpgm
653    i32 %in, i32 addrspace(1)* %out) {
654entry:
  ; Operation under test: agent-scope atomic store with release ordering.
655  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4
656  ret void
657}
658
; Kernel under test: writes the i32 kernel argument %in to %out with an atomic
; store that uses syncscope("agent") and seq_cst ordering.
; The autogenerated check lines below expect "s_waitcnt vmcnt(0) lgkmcnt(0)"
; ahead of the store on every configuration, plus "s_waitcnt_vscnt null, 0x0"
; on both gfx1010 configurations. The store itself carries no modifier and no
; instruction follows it before s_endpgm.
659define amdgpu_kernel void @global_agent_seq_cst_store(
660; GFX6-LABEL: global_agent_seq_cst_store:
661; GFX6:       ; %bb.0: ; %entry
662; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
663; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
664; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
665; GFX6-NEXT:    s_mov_b32 s2, -1
666; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX6-NEXT:    v_mov_b32_e32 v0, s6
668; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
669; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
670; GFX6-NEXT:    s_endpgm
671;
672; GFX7-LABEL: global_agent_seq_cst_store:
673; GFX7:       ; %bb.0: ; %entry
674; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
675; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
676; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX7-NEXT:    v_mov_b32_e32 v2, s2
678; GFX7-NEXT:    v_mov_b32_e32 v0, s0
679; GFX7-NEXT:    v_mov_b32_e32 v1, s1
680; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; GFX7-NEXT:    flat_store_dword v[0:1], v2
682; GFX7-NEXT:    s_endpgm
683;
684; GFX10-WGP-LABEL: global_agent_seq_cst_store:
685; GFX10-WGP:       ; %bb.0: ; %entry
686; GFX10-WGP-NEXT:    s_clause 0x1
687; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
688; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
689; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
690; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
692; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
693; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
694; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
695; GFX10-WGP-NEXT:    s_endpgm
696;
697; GFX10-CU-LABEL: global_agent_seq_cst_store:
698; GFX10-CU:       ; %bb.0: ; %entry
699; GFX10-CU-NEXT:    s_clause 0x1
700; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
701; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
702; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
703; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
705; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
706; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
707; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
708; GFX10-CU-NEXT:    s_endpgm
709;
710; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_store:
711; SKIP-CACHE-INV:       ; %bb.0: ; %entry
712; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
713; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
714; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
715; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
716; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
717; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
718; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
719; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
720; SKIP-CACHE-INV-NEXT:    s_endpgm
721;
722; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store:
723; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
724; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
725; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
726; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
727; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
729; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
730; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
731; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
732;
733; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store:
734; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
735; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
736; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
737; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
738; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
740; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
741; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
742; GFX90A-TGSPLIT-NEXT:    s_endpgm
743    i32 %in, i32 addrspace(1)* %out) {
744entry:
  ; Operation under test: agent-scope atomic store with seq_cst ordering.
745  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4
746  ret void
747}
748
; Test: agent-scope monotonic atomicrmw xchg through an addrspace(1) pointer,
; where the result %val has no uses.
; In every run configuration the expected output is only the s_load
; instructions and register setup, a single s_waitcnt lgkmcnt(0), and the swap
; itself (written without glc). No wait follows the swap and no cache
; invalidation instruction appears.
749define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
750; GFX6-LABEL: global_agent_monotonic_atomicrmw:
751; GFX6:       ; %bb.0: ; %entry
752; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
753; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
754; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
755; GFX6-NEXT:    s_mov_b32 s2, -1
756; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX6-NEXT:    v_mov_b32_e32 v0, s4
758; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
759; GFX6-NEXT:    s_endpgm
760;
761; GFX7-LABEL: global_agent_monotonic_atomicrmw:
762; GFX7:       ; %bb.0: ; %entry
763; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
764; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
765; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX7-NEXT:    v_mov_b32_e32 v0, s0
767; GFX7-NEXT:    v_mov_b32_e32 v1, s1
768; GFX7-NEXT:    v_mov_b32_e32 v2, s2
769; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
770; GFX7-NEXT:    s_endpgm
771;
772; GFX10-WGP-LABEL: global_agent_monotonic_atomicrmw:
773; GFX10-WGP:       ; %bb.0: ; %entry
774; GFX10-WGP-NEXT:    s_clause 0x1
775; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
776; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
777; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
778; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
780; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
781; GFX10-WGP-NEXT:    s_endpgm
782;
783; GFX10-CU-LABEL: global_agent_monotonic_atomicrmw:
784; GFX10-CU:       ; %bb.0: ; %entry
785; GFX10-CU-NEXT:    s_clause 0x1
786; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
787; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
788; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
789; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
791; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
792; GFX10-CU-NEXT:    s_endpgm
793;
794; SKIP-CACHE-INV-LABEL: global_agent_monotonic_atomicrmw:
795; SKIP-CACHE-INV:       ; %bb.0: ; %entry
796; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
797; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
798; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
799; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
800; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
802; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
803; SKIP-CACHE-INV-NEXT:    s_endpgm
804;
805; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
806; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
807; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
808; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
809; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
810; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
812; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
813; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
814;
815; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
816; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
817; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
818; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
819; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
820; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
822; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
823; GFX90A-TGSPLIT-NEXT:    s_endpgm
824    i32 addrspace(1)* %out, i32 %in) {
825entry:
  ; Exchange %in into *%out; %val is never read afterwards.
826  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic
827  ret void
828}
829
; Test: agent-scope acquire atomicrmw xchg through an addrspace(1) pointer,
; where the result %val has no uses.
; Expected shape: nothing extra precedes the swap. After it comes a wait
; (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs)
; and then a cache invalidate -- buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol
; on gfx700 and both gfx90a runs, buffer_gl0_inv + buffer_gl1_inv on gfx1010.
; The run that passes -amdgcn-skip-cache-invalidations keeps the wait but
; expects no invalidate instruction.
830define amdgpu_kernel void @global_agent_acquire_atomicrmw(
831; GFX6-LABEL: global_agent_acquire_atomicrmw:
832; GFX6:       ; %bb.0: ; %entry
833; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
834; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
835; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
836; GFX6-NEXT:    s_mov_b32 s2, -1
837; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
838; GFX6-NEXT:    v_mov_b32_e32 v0, s4
839; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
840; GFX6-NEXT:    s_waitcnt vmcnt(0)
841; GFX6-NEXT:    buffer_wbinvl1
842; GFX6-NEXT:    s_endpgm
843;
844; GFX7-LABEL: global_agent_acquire_atomicrmw:
845; GFX7:       ; %bb.0: ; %entry
846; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
847; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
848; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX7-NEXT:    v_mov_b32_e32 v0, s0
850; GFX7-NEXT:    v_mov_b32_e32 v1, s1
851; GFX7-NEXT:    v_mov_b32_e32 v2, s2
852; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
853; GFX7-NEXT:    s_waitcnt vmcnt(0)
854; GFX7-NEXT:    buffer_wbinvl1_vol
855; GFX7-NEXT:    s_endpgm
856;
857; GFX10-WGP-LABEL: global_agent_acquire_atomicrmw:
858; GFX10-WGP:       ; %bb.0: ; %entry
859; GFX10-WGP-NEXT:    s_clause 0x1
860; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
861; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
862; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
863; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
865; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
866; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
867; GFX10-WGP-NEXT:    buffer_gl0_inv
868; GFX10-WGP-NEXT:    buffer_gl1_inv
869; GFX10-WGP-NEXT:    s_endpgm
870;
871; GFX10-CU-LABEL: global_agent_acquire_atomicrmw:
872; GFX10-CU:       ; %bb.0: ; %entry
873; GFX10-CU-NEXT:    s_clause 0x1
874; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
875; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
876; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
877; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
879; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
880; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
881; GFX10-CU-NEXT:    buffer_gl0_inv
882; GFX10-CU-NEXT:    buffer_gl1_inv
883; GFX10-CU-NEXT:    s_endpgm
884;
885; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw:
886; SKIP-CACHE-INV:       ; %bb.0: ; %entry
887; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
888; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
889; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
890; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
891; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
892; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
893; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
894; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
895; SKIP-CACHE-INV-NEXT:    s_endpgm
896;
897; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw:
898; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
899; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
900; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
901; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
902; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
904; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
905; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
906; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
907; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
908;
909; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw:
910; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
911; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
912; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
913; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
914; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
916; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
917; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
918; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
919; GFX90A-TGSPLIT-NEXT:    s_endpgm
920    i32 addrspace(1)* %out, i32 %in) {
921entry:
  ; Exchange %in into *%out with acquire ordering; %val is never read.
922  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
923  ret void
924}
925
; Test: agent-scope release atomicrmw xchg through an addrspace(1) pointer,
; where the result %val has no uses.
; Expected shape: s_waitcnt vmcnt(0) lgkmcnt(0) directly before the swap,
; followed on the two gfx1010 runs by s_waitcnt_vscnt null, 0x0. Only s_endpgm
; follows the swap, so there is no trailing wait and no cache invalidate in
; any run.
926define amdgpu_kernel void @global_agent_release_atomicrmw(
927; GFX6-LABEL: global_agent_release_atomicrmw:
928; GFX6:       ; %bb.0: ; %entry
929; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
930; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
931; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
932; GFX6-NEXT:    s_mov_b32 s2, -1
933; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX6-NEXT:    v_mov_b32_e32 v0, s4
935; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
936; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
937; GFX6-NEXT:    s_endpgm
938;
939; GFX7-LABEL: global_agent_release_atomicrmw:
940; GFX7:       ; %bb.0: ; %entry
941; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
942; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
943; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX7-NEXT:    v_mov_b32_e32 v0, s0
945; GFX7-NEXT:    v_mov_b32_e32 v1, s1
946; GFX7-NEXT:    v_mov_b32_e32 v2, s2
947; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
948; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
949; GFX7-NEXT:    s_endpgm
950;
951; GFX10-WGP-LABEL: global_agent_release_atomicrmw:
952; GFX10-WGP:       ; %bb.0: ; %entry
953; GFX10-WGP-NEXT:    s_clause 0x1
954; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
955; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
956; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
957; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
959; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
960; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
961; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
962; GFX10-WGP-NEXT:    s_endpgm
963;
964; GFX10-CU-LABEL: global_agent_release_atomicrmw:
965; GFX10-CU:       ; %bb.0: ; %entry
966; GFX10-CU-NEXT:    s_clause 0x1
967; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
968; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
969; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
970; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
972; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
973; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
974; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
975; GFX10-CU-NEXT:    s_endpgm
976;
977; SKIP-CACHE-INV-LABEL: global_agent_release_atomicrmw:
978; SKIP-CACHE-INV:       ; %bb.0: ; %entry
979; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
980; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
981; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
982; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
983; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
985; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
987; SKIP-CACHE-INV-NEXT:    s_endpgm
988;
989; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw:
990; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
991; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
992; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
993; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
994; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
996; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
997; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
998; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
999;
1000; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw:
1001; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1002; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1003; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1004; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1005; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1007; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1008; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1009; GFX90A-TGSPLIT-NEXT:    s_endpgm
1010    i32 addrspace(1)* %out, i32 %in) {
1011entry:
  ; Exchange %in into *%out with release ordering; %val is never read.
1012  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release
1013  ret void
1014}
1015
; Test: agent-scope acq_rel atomicrmw xchg through an addrspace(1) pointer,
; where the result %val has no uses.
; Expected shape, before the swap: s_waitcnt vmcnt(0) lgkmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs.
; Expected shape, after the swap: a wait (s_waitcnt vmcnt(0), or
; s_waitcnt_vscnt null, 0x0 on gfx1010) and then a cache invalidate --
; buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on gfx700 and gfx90a,
; buffer_gl0_inv + buffer_gl1_inv on gfx1010. The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but expects no invalidate.
1016define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
1017; GFX6-LABEL: global_agent_acq_rel_atomicrmw:
1018; GFX6:       ; %bb.0: ; %entry
1019; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1020; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1021; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1022; GFX6-NEXT:    s_mov_b32 s2, -1
1023; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1025; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1026; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1027; GFX6-NEXT:    s_waitcnt vmcnt(0)
1028; GFX6-NEXT:    buffer_wbinvl1
1029; GFX6-NEXT:    s_endpgm
1030;
1031; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
1032; GFX7:       ; %bb.0: ; %entry
1033; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1034; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1035; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1037; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1038; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1039; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1040; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1041; GFX7-NEXT:    s_waitcnt vmcnt(0)
1042; GFX7-NEXT:    buffer_wbinvl1_vol
1043; GFX7-NEXT:    s_endpgm
1044;
1045; GFX10-WGP-LABEL: global_agent_acq_rel_atomicrmw:
1046; GFX10-WGP:       ; %bb.0: ; %entry
1047; GFX10-WGP-NEXT:    s_clause 0x1
1048; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1049; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1050; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1053; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1054; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1055; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
1056; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1057; GFX10-WGP-NEXT:    buffer_gl0_inv
1058; GFX10-WGP-NEXT:    buffer_gl1_inv
1059; GFX10-WGP-NEXT:    s_endpgm
1060;
1061; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw:
1062; GFX10-CU:       ; %bb.0: ; %entry
1063; GFX10-CU-NEXT:    s_clause 0x1
1064; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1065; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1066; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1067; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1068; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1069; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1070; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1071; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
1072; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1073; GFX10-CU-NEXT:    buffer_gl0_inv
1074; GFX10-CU-NEXT:    buffer_gl1_inv
1075; GFX10-CU-NEXT:    s_endpgm
1076;
1077; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw:
1078; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1079; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1080; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1081; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1082; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1083; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1084; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1085; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1086; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1087; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1088; SKIP-CACHE-INV-NEXT:    s_endpgm
1089;
1090; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
1091; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1092; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1093; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1094; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1095; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1097; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1099; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1100; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1101; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1102;
1103; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
1104; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1105; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1106; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1107; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1108; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1110; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1111; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1112; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1113; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1114; GFX90A-TGSPLIT-NEXT:    s_endpgm
1115    i32 addrspace(1)* %out, i32 %in) {
1116entry:
  ; Exchange %in into *%out with acq_rel ordering; %val is never read.
1117  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
1118  ret void
1119}
1120
; Test: agent-scope seq_cst atomicrmw xchg through an addrspace(1) pointer,
; where the result %val has no uses.
; Expected shape, before the swap: s_waitcnt vmcnt(0) lgkmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs.
; Expected shape, after the swap: a wait (s_waitcnt vmcnt(0), or
; s_waitcnt_vscnt null, 0x0 on gfx1010) and then a cache invalidate --
; buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on gfx700 and gfx90a,
; buffer_gl0_inv + buffer_gl1_inv on gfx1010. The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but expects no invalidate.
1121define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
1122; GFX6-LABEL: global_agent_seq_cst_atomicrmw:
1123; GFX6:       ; %bb.0: ; %entry
1124; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1125; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1126; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1127; GFX6-NEXT:    s_mov_b32 s2, -1
1128; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1130; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1131; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1132; GFX6-NEXT:    s_waitcnt vmcnt(0)
1133; GFX6-NEXT:    buffer_wbinvl1
1134; GFX6-NEXT:    s_endpgm
1135;
1136; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
1137; GFX7:       ; %bb.0: ; %entry
1138; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1139; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1140; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1142; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1143; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1144; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1145; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1146; GFX7-NEXT:    s_waitcnt vmcnt(0)
1147; GFX7-NEXT:    buffer_wbinvl1_vol
1148; GFX7-NEXT:    s_endpgm
1149;
1150; GFX10-WGP-LABEL: global_agent_seq_cst_atomicrmw:
1151; GFX10-WGP:       ; %bb.0: ; %entry
1152; GFX10-WGP-NEXT:    s_clause 0x1
1153; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1154; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1155; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1156; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1158; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1159; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1160; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
1161; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1162; GFX10-WGP-NEXT:    buffer_gl0_inv
1163; GFX10-WGP-NEXT:    buffer_gl1_inv
1164; GFX10-WGP-NEXT:    s_endpgm
1165;
1166; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw:
1167; GFX10-CU:       ; %bb.0: ; %entry
1168; GFX10-CU-NEXT:    s_clause 0x1
1169; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1170; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1171; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1172; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1174; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1175; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1176; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
1177; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1178; GFX10-CU-NEXT:    buffer_gl0_inv
1179; GFX10-CU-NEXT:    buffer_gl1_inv
1180; GFX10-CU-NEXT:    s_endpgm
1181;
1182; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw:
1183; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1184; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1185; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1186; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1187; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1188; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1190; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1191; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1192; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1193; SKIP-CACHE-INV-NEXT:    s_endpgm
1194;
1195; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
1196; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1198; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1199; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1200; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1203; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1204; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1205; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1206; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1207;
1208; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
1209; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1210; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1211; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1212; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1213; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1215; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1216; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1217; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1218; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1219; GFX90A-TGSPLIT-NEXT:    s_endpgm
1220    i32 addrspace(1)* %out, i32 %in) {
1221entry:
  ; Exchange %in into *%out with seq_cst ordering; %val is never read.
1222  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
1223  ret void
1224}
1225
; Test: agent-scope acquire atomicrmw xchg through an addrspace(1) pointer
; whose result IS used, because %val is written back to %out by a plain store.
; Expected shape: the swap carries glc and names a destination register. It is
; followed by s_waitcnt vmcnt(0) in every run (including the gfx1010 runs),
; then a cache invalidate, then the store. The invalidate is buffer_wbinvl1 on
; gfx600, buffer_wbinvl1_vol on gfx700 and gfx90a, and
; buffer_gl0_inv + buffer_gl1_inv on gfx1010. The run that passes
; -amdgcn-skip-cache-invalidations goes straight from the wait to the store.
1226define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
1227; GFX6-LABEL: global_agent_acquire_ret_atomicrmw:
1228; GFX6:       ; %bb.0: ; %entry
1229; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1230; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1231; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1232; GFX6-NEXT:    s_mov_b32 s2, -1
1233; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1234; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1235; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1236; GFX6-NEXT:    s_waitcnt vmcnt(0)
1237; GFX6-NEXT:    buffer_wbinvl1
1238; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1239; GFX6-NEXT:    s_endpgm
1240;
1241; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
1242; GFX7:       ; %bb.0: ; %entry
1243; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1244; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1245; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1247; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1248; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1249; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1250; GFX7-NEXT:    s_waitcnt vmcnt(0)
1251; GFX7-NEXT:    buffer_wbinvl1_vol
1252; GFX7-NEXT:    flat_store_dword v[0:1], v2
1253; GFX7-NEXT:    s_endpgm
1254;
1255; GFX10-WGP-LABEL: global_agent_acquire_ret_atomicrmw:
1256; GFX10-WGP:       ; %bb.0: ; %entry
1257; GFX10-WGP-NEXT:    s_clause 0x1
1258; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1259; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1260; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1261; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1262; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1263; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1264; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1265; GFX10-WGP-NEXT:    buffer_gl0_inv
1266; GFX10-WGP-NEXT:    buffer_gl1_inv
1267; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1268; GFX10-WGP-NEXT:    s_endpgm
1269;
1270; GFX10-CU-LABEL: global_agent_acquire_ret_atomicrmw:
1271; GFX10-CU:       ; %bb.0: ; %entry
1272; GFX10-CU-NEXT:    s_clause 0x1
1273; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1274; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1275; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1276; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1278; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1279; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1280; GFX10-CU-NEXT:    buffer_gl0_inv
1281; GFX10-CU-NEXT:    buffer_gl1_inv
1282; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1283; GFX10-CU-NEXT:    s_endpgm
1284;
1285; SKIP-CACHE-INV-LABEL: global_agent_acquire_ret_atomicrmw:
1286; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1287; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1288; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1289; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1290; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1291; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1292; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1293; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1294; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1295; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1296; SKIP-CACHE-INV-NEXT:    s_endpgm
1297;
1298; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
1299; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1300; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1301; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1302; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1303; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1305; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1306; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1307; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1308; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1309; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1310;
1311; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
1312; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1313; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1314; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1315; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1316; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1318; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1319; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1320; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1321; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1322; GFX90A-TGSPLIT-NEXT:    s_endpgm
1323    i32 addrspace(1)* %out, i32 %in) {
1324entry:
  ; Exchange %in into *%out with acquire ordering and keep the old value.
1325  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
  ; Non-atomic store of the returned value; this use is what makes %val live.
1326  store i32 %val, i32 addrspace(1)* %out, align 4
1327  ret void
1328}
1329
; Test: agent-scope acq_rel atomicrmw xchg through an addrspace(1) pointer
; whose result IS used, because %val is written back to %out by a plain store.
; Expected shape, before the swap: s_waitcnt vmcnt(0) lgkmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs.
; Expected shape, after the swap (which carries glc and names a destination
; register): s_waitcnt vmcnt(0) in every run, then a cache invalidate, then
; the store. The invalidate is buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on
; gfx700 and gfx90a, and buffer_gl0_inv + buffer_gl1_inv on gfx1010. The run
; that passes -amdgcn-skip-cache-invalidations expects no invalidate.
1330define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
1331; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw:
1332; GFX6:       ; %bb.0: ; %entry
1333; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1334; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1335; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1336; GFX6-NEXT:    s_mov_b32 s2, -1
1337; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1339; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1340; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1341; GFX6-NEXT:    s_waitcnt vmcnt(0)
1342; GFX6-NEXT:    buffer_wbinvl1
1343; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1344; GFX6-NEXT:    s_endpgm
1345;
1346; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
1347; GFX7:       ; %bb.0: ; %entry
1348; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1349; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1350; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1351; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1352; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1353; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1354; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1355; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1356; GFX7-NEXT:    s_waitcnt vmcnt(0)
1357; GFX7-NEXT:    buffer_wbinvl1_vol
1358; GFX7-NEXT:    flat_store_dword v[0:1], v2
1359; GFX7-NEXT:    s_endpgm
1360;
1361; GFX10-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw:
1362; GFX10-WGP:       ; %bb.0: ; %entry
1363; GFX10-WGP-NEXT:    s_clause 0x1
1364; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1365; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1366; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1367; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1369; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1370; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1371; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1372; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1373; GFX10-WGP-NEXT:    buffer_gl0_inv
1374; GFX10-WGP-NEXT:    buffer_gl1_inv
1375; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1376; GFX10-WGP-NEXT:    s_endpgm
1377;
1378; GFX10-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
1379; GFX10-CU:       ; %bb.0: ; %entry
1380; GFX10-CU-NEXT:    s_clause 0x1
1381; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1382; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1383; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1384; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1386; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1387; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1388; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1389; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1390; GFX10-CU-NEXT:    buffer_gl0_inv
1391; GFX10-CU-NEXT:    buffer_gl1_inv
1392; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1393; GFX10-CU-NEXT:    s_endpgm
1394;
1395; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_ret_atomicrmw:
1396; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1397; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1398; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1399; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1400; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1401; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1402; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1403; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1405; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1406; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1407; SKIP-CACHE-INV-NEXT:    s_endpgm
1408;
1409; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
1410; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1411; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1412; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1413; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1414; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1415; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1416; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1417; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1418; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1419; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1420; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1421; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1422;
1423; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
1424; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1425; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1426; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1427; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1428; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1429; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1430; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1431; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1432; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1433; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1434; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1435; GFX90A-TGSPLIT-NEXT:    s_endpgm
1436    i32 addrspace(1)* %out, i32 %in) {
1437entry:
  ; Exchange %in into *%out with acq_rel ordering and keep the old value.
1438  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
  ; Non-atomic store of the returned value; this use is what makes %val live.
1439  store i32 %val, i32 addrspace(1)* %out, align 4
1440  ret void
1441}
1442
; Agent-scope seq_cst `atomicrmw xchg` on an addrspace(1) pointer whose returned
; value is stored back through the same pointer. In every configuration the
; expected output has an s_waitcnt before the swap and an s_waitcnt vmcnt(0)
; after it. All runs except the one using -amdgcn-skip-cache-invalidations also
; expect an invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv +
; buffer_gl1_inv) before the final store.
1443define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
1444; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw:
1445; GFX6:       ; %bb.0: ; %entry
1446; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1447; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1448; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1449; GFX6-NEXT:    s_mov_b32 s2, -1
1450; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1452; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1453; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1454; GFX6-NEXT:    s_waitcnt vmcnt(0)
1455; GFX6-NEXT:    buffer_wbinvl1
1456; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1457; GFX6-NEXT:    s_endpgm
1458;
1459; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
1460; GFX7:       ; %bb.0: ; %entry
1461; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1462; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1463; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1465; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1466; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1467; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1468; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1469; GFX7-NEXT:    s_waitcnt vmcnt(0)
1470; GFX7-NEXT:    buffer_wbinvl1_vol
1471; GFX7-NEXT:    flat_store_dword v[0:1], v2
1472; GFX7-NEXT:    s_endpgm
1473;
1474; GFX10-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw:
1475; GFX10-WGP:       ; %bb.0: ; %entry
1476; GFX10-WGP-NEXT:    s_clause 0x1
1477; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1478; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1479; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1480; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1481; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1482; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1483; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1484; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1485; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1486; GFX10-WGP-NEXT:    buffer_gl0_inv
1487; GFX10-WGP-NEXT:    buffer_gl1_inv
1488; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1489; GFX10-WGP-NEXT:    s_endpgm
1490;
1491; GFX10-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
1492; GFX10-CU:       ; %bb.0: ; %entry
1493; GFX10-CU-NEXT:    s_clause 0x1
1494; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1495; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1496; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1497; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1499; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1500; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1501; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1502; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1503; GFX10-CU-NEXT:    buffer_gl0_inv
1504; GFX10-CU-NEXT:    buffer_gl1_inv
1505; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1506; GFX10-CU-NEXT:    s_endpgm
1507;
1508; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_ret_atomicrmw:
1509; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1510; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1511; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1512; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1513; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1514; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1516; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1517; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1518; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1519; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1520; SKIP-CACHE-INV-NEXT:    s_endpgm
1521;
1522; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
1523; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1524; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1525; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1526; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1527; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1528; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1529; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1530; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1531; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1532; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1533; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1534; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1535;
1536; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
1537; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1538; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1539; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1540; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1541; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1542; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1543; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1544; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1545; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1546; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1547; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1548; GFX90A-TGSPLIT-NEXT:    s_endpgm
1549    i32 addrspace(1)* %out, i32 %in) {
1550entry:
  ; The swapped-out value is consumed by the store below, which is why every
  ; expected atomic above carries the glc modifier and is followed by a store.
1551  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
1552  store i32 %val, i32 addrspace(1)* %out, align 4
1553  ret void
1554}
1555
; Agent-scope `cmpxchg` with monotonic success and failure orderings on an
; addrspace(1) pointer; the result is not used. In every configuration the
; expected output is the bare cmpswap: the only wait is the s_waitcnt lgkmcnt(0)
; ahead of the operand moves, and nothing follows the atomic but s_endpgm.
1556define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
1557; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1558; GFX6:       ; %bb.0: ; %entry
1559; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1560; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1561; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1562; GFX6-NEXT:    s_mov_b32 s2, -1
1563; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1565; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1566; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1567; GFX6-NEXT:    s_endpgm
1568;
1569; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1570; GFX7:       ; %bb.0: ; %entry
1571; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1572; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1573; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX7-NEXT:    s_add_u32 s0, s0, 16
1575; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1576; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1577; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1578; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1579; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1580; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1581; GFX7-NEXT:    s_endpgm
1582;
1583; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1584; GFX10-WGP:       ; %bb.0: ; %entry
1585; GFX10-WGP-NEXT:    s_clause 0x1
1586; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1587; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1588; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1589; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1590; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1591; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1592; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1593; GFX10-WGP-NEXT:    s_endpgm
1594;
1595; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1596; GFX10-CU:       ; %bb.0: ; %entry
1597; GFX10-CU-NEXT:    s_clause 0x1
1598; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1599; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1600; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1601; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1602; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1603; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1604; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1605; GFX10-CU-NEXT:    s_endpgm
1606;
1607; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1608; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1609; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1610; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1611; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1612; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1613; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1614; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1615; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1616; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1617; SKIP-CACHE-INV-NEXT:    s_endpgm
1618;
1619; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1620; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1621; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1622; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1623; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1624; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1626; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1627; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1628;
1629; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
1630; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1631; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1632; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1633; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1634; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1635; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1636; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1637; GFX90A-TGSPLIT-NEXT:    s_endpgm
1638    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1639entry:
  ; i32 element index 4 corresponds to the offset:16 (or the +16 scalar add in
  ; the flat_atomic_cmpswap variant) in the expected instructions above.
1640  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, so the expected cmpswap instructions have no glc modifier.
1641  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
1642  ret void
1643}
1644
; Agent-scope `cmpxchg` with acquire success ordering and monotonic failure
; ordering on an addrspace(1) pointer; the result is not used. The expected
; output adds nothing before the cmpswap. After it comes a wait (s_waitcnt
; vmcnt(0), or s_waitcnt_vscnt on the gfx1010 runs) followed by an invalidate
; (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv).
; The run using -amdgcn-skip-cache-invalidations keeps the wait but has no
; invalidate.
1645define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
1646; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg:
1647; GFX6:       ; %bb.0: ; %entry
1648; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1649; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1650; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1651; GFX6-NEXT:    s_mov_b32 s2, -1
1652; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1653; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1654; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1655; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1656; GFX6-NEXT:    s_waitcnt vmcnt(0)
1657; GFX6-NEXT:    buffer_wbinvl1
1658; GFX6-NEXT:    s_endpgm
1659;
1660; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
1661; GFX7:       ; %bb.0: ; %entry
1662; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1663; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1664; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX7-NEXT:    s_add_u32 s0, s0, 16
1666; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1667; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1668; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1669; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1670; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1671; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1672; GFX7-NEXT:    s_waitcnt vmcnt(0)
1673; GFX7-NEXT:    buffer_wbinvl1_vol
1674; GFX7-NEXT:    s_endpgm
1675;
1676; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
1677; GFX10-WGP:       ; %bb.0: ; %entry
1678; GFX10-WGP-NEXT:    s_clause 0x1
1679; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1680; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1681; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1682; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1684; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1685; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1686; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1687; GFX10-WGP-NEXT:    buffer_gl0_inv
1688; GFX10-WGP-NEXT:    buffer_gl1_inv
1689; GFX10-WGP-NEXT:    s_endpgm
1690;
1691; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
1692; GFX10-CU:       ; %bb.0: ; %entry
1693; GFX10-CU-NEXT:    s_clause 0x1
1694; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1695; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1696; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1697; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1698; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1699; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1700; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1701; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1702; GFX10-CU-NEXT:    buffer_gl0_inv
1703; GFX10-CU-NEXT:    buffer_gl1_inv
1704; GFX10-CU-NEXT:    s_endpgm
1705;
1706; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg:
1707; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1708; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1709; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1710; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1711; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1712; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1713; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1714; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1715; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1716; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1717; SKIP-CACHE-INV-NEXT:    s_endpgm
1718;
1719; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
1720; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1721; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1722; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1723; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1724; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1726; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1727; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1728; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1729; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1730;
1731; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
1732; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1733; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1734; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1735; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1736; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1738; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1739; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1740; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1741; GFX90A-TGSPLIT-NEXT:    s_endpgm
1742    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1743entry:
  ; i32 element index 4 corresponds to the offset:16 (or the +16 scalar add in
  ; the flat_atomic_cmpswap variant) in the expected instructions above.
1744  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, so the expected cmpswap instructions have no glc modifier.
1745  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
1746  ret void
1747}
1748
; Agent-scope `cmpxchg` with release success ordering and monotonic failure
; ordering on an addrspace(1) pointer; the result is not used. The expected
; output places an s_waitcnt vmcnt(0) lgkmcnt(0) directly before the cmpswap,
; and the gfx1010 runs add an s_waitcnt_vscnt as well. Nothing but s_endpgm
; follows the atomic in any configuration.
1749define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
1750; GFX6-LABEL: global_agent_release_monotonic_cmpxchg:
1751; GFX6:       ; %bb.0: ; %entry
1752; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1753; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1754; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1755; GFX6-NEXT:    s_mov_b32 s2, -1
1756; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1757; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1758; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1759; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1760; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1761; GFX6-NEXT:    s_endpgm
1762;
1763; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
1764; GFX7:       ; %bb.0: ; %entry
1765; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1766; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1767; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX7-NEXT:    s_add_u32 s0, s0, 16
1769; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1770; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1771; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1772; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1773; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1774; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1775; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1776; GFX7-NEXT:    s_endpgm
1777;
1778; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
1779; GFX10-WGP:       ; %bb.0: ; %entry
1780; GFX10-WGP-NEXT:    s_clause 0x1
1781; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1782; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1783; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1784; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1785; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1786; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1787; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1788; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1789; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1790; GFX10-WGP-NEXT:    s_endpgm
1791;
1792; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg:
1793; GFX10-CU:       ; %bb.0: ; %entry
1794; GFX10-CU-NEXT:    s_clause 0x1
1795; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1796; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1797; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1798; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1800; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1801; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1802; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1803; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1804; GFX10-CU-NEXT:    s_endpgm
1805;
1806; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg:
1807; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1808; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1809; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1810; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1811; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1812; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1813; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1814; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1815; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1816; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1817; SKIP-CACHE-INV-NEXT:    s_endpgm
1818;
1819; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
1820; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1821; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1822; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1823; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1824; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1826; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1827; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1828; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1829;
1830; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
1831; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1832; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1833; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1834; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1835; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1836; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1837; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1838; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1839; GFX90A-TGSPLIT-NEXT:    s_endpgm
1840    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1841entry:
  ; i32 element index 4 corresponds to the offset:16 (or the +16 scalar add in
  ; the flat_atomic_cmpswap variant) in the expected instructions above.
1842  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, so the expected cmpswap instructions have no glc modifier.
1843  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic
1844  ret void
1845}
1846
; Agent-scope `cmpxchg` with acq_rel success ordering and monotonic failure
; ordering on an addrspace(1) pointer; the result is not used. The expected
; output has waits on both sides of the cmpswap. Before it comes an s_waitcnt
; vmcnt(0) lgkmcnt(0), plus an s_waitcnt_vscnt on the gfx1010 runs. After it
; comes a wait followed by an invalidate (buffer_wbinvl1, buffer_wbinvl1_vol,
; or buffer_gl0_inv + buffer_gl1_inv). The run using
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate.
1847define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
1848; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1849; GFX6:       ; %bb.0: ; %entry
1850; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1851; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1852; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1853; GFX6-NEXT:    s_mov_b32 s2, -1
1854; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1856; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1857; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1858; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1859; GFX6-NEXT:    s_waitcnt vmcnt(0)
1860; GFX6-NEXT:    buffer_wbinvl1
1861; GFX6-NEXT:    s_endpgm
1862;
1863; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1864; GFX7:       ; %bb.0: ; %entry
1865; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1866; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1867; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1868; GFX7-NEXT:    s_add_u32 s0, s0, 16
1869; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1870; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1871; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1872; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1873; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1874; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1875; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1876; GFX7-NEXT:    s_waitcnt vmcnt(0)
1877; GFX7-NEXT:    buffer_wbinvl1_vol
1878; GFX7-NEXT:    s_endpgm
1879;
1880; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1881; GFX10-WGP:       ; %bb.0: ; %entry
1882; GFX10-WGP-NEXT:    s_clause 0x1
1883; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1884; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1885; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1886; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1888; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1889; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1890; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1891; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1892; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1893; GFX10-WGP-NEXT:    buffer_gl0_inv
1894; GFX10-WGP-NEXT:    buffer_gl1_inv
1895; GFX10-WGP-NEXT:    s_endpgm
1896;
1897; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1898; GFX10-CU:       ; %bb.0: ; %entry
1899; GFX10-CU-NEXT:    s_clause 0x1
1900; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1901; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1902; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1903; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1904; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1905; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1906; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1907; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1908; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1909; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1910; GFX10-CU-NEXT:    buffer_gl0_inv
1911; GFX10-CU-NEXT:    buffer_gl1_inv
1912; GFX10-CU-NEXT:    s_endpgm
1913;
1914; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1915; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1916; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1917; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1918; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1919; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1920; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1922; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1923; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1924; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1925; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1926; SKIP-CACHE-INV-NEXT:    s_endpgm
1927;
1928; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1929; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1930; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1931; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1932; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1933; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1934; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1935; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1936; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1937; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1938; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1939; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1940;
1941; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
1942; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1943; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1944; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1945; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1946; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1947; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1948; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1949; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1950; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1951; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1952; GFX90A-TGSPLIT-NEXT:    s_endpgm
1953    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1954entry:
  ; i32 element index 4 corresponds to the offset:16 (or the +16 scalar add in
  ; the flat_atomic_cmpswap variant) in the expected instructions above.
1955  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, so the expected cmpswap instructions have no glc modifier.
1956  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
1957  ret void
1958}
1959
; Agent-scope `cmpxchg` with seq_cst success ordering and monotonic failure
; ordering on an addrspace(1) pointer; the result is not used. The expected
; output has waits on both sides of the cmpswap. Before it comes an s_waitcnt
; vmcnt(0) lgkmcnt(0), plus an s_waitcnt_vscnt on the gfx1010 runs. After it
; comes a wait followed by an invalidate (buffer_wbinvl1, buffer_wbinvl1_vol,
; or buffer_gl0_inv + buffer_gl1_inv). The run using
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate.
1960define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
1961; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
1962; GFX6:       ; %bb.0: ; %entry
1963; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1964; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1965; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1966; GFX6-NEXT:    s_mov_b32 s2, -1
1967; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1969; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1970; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1971; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1972; GFX6-NEXT:    s_waitcnt vmcnt(0)
1973; GFX6-NEXT:    buffer_wbinvl1
1974; GFX6-NEXT:    s_endpgm
1975;
1976; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
1977; GFX7:       ; %bb.0: ; %entry
1978; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1979; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1980; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1981; GFX7-NEXT:    s_add_u32 s0, s0, 16
1982; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1983; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1984; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1985; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1986; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1987; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1988; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1989; GFX7-NEXT:    s_waitcnt vmcnt(0)
1990; GFX7-NEXT:    buffer_wbinvl1_vol
1991; GFX7-NEXT:    s_endpgm
1992;
1993; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
1994; GFX10-WGP:       ; %bb.0: ; %entry
1995; GFX10-WGP-NEXT:    s_clause 0x1
1996; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1997; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1998; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1999; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2000; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2001; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2002; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2003; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2004; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2005; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2006; GFX10-WGP-NEXT:    buffer_gl0_inv
2007; GFX10-WGP-NEXT:    buffer_gl1_inv
2008; GFX10-WGP-NEXT:    s_endpgm
2009;
2010; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
2011; GFX10-CU:       ; %bb.0: ; %entry
2012; GFX10-CU-NEXT:    s_clause 0x1
2013; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2014; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2015; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2016; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2017; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2018; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2019; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2020; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2021; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2022; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2023; GFX10-CU-NEXT:    buffer_gl0_inv
2024; GFX10-CU-NEXT:    buffer_gl1_inv
2025; GFX10-CU-NEXT:    s_endpgm
2026;
2027; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
2028; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2029; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2030; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2031; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2032; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2033; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2034; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2036; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2037; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2038; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2039; SKIP-CACHE-INV-NEXT:    s_endpgm
2040;
2041; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
2042; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2043; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2044; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2045; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2046; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2047; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2048; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2049; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2050; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2051; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2052; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2053;
2054; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
2055; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2056; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2057; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2058; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2059; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2060; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2061; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2062; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2063; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2064; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2065; GFX90A-TGSPLIT-NEXT:    s_endpgm
2066    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2067entry:
  ; i32 element index 4 corresponds to the offset:16 (or the +16 scalar add in
  ; the flat_atomic_cmpswap variant) in the expected instructions above.
2068  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, so the expected cmpswap instructions have no glc modifier.
2069  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
2070  ret void
2071}
2072
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering acquire and failure ordering acquire.
; The cmpxchg result is not used.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * no "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic;
;   * after the atomic, a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0
;     for the two GFX10 prefixes);
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix, which expects the wait only.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2073define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
2074; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg:
2075; GFX6:       ; %bb.0: ; %entry
2076; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2077; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2078; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2079; GFX6-NEXT:    s_mov_b32 s2, -1
2080; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2081; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2082; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2083; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2084; GFX6-NEXT:    s_waitcnt vmcnt(0)
2085; GFX6-NEXT:    buffer_wbinvl1
2086; GFX6-NEXT:    s_endpgm
2087;
2088; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
2089; GFX7:       ; %bb.0: ; %entry
2090; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2091; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2092; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2093; GFX7-NEXT:    s_add_u32 s0, s0, 16
2094; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2095; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2096; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2097; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2098; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2099; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2100; GFX7-NEXT:    s_waitcnt vmcnt(0)
2101; GFX7-NEXT:    buffer_wbinvl1_vol
2102; GFX7-NEXT:    s_endpgm
2103;
2104; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
2105; GFX10-WGP:       ; %bb.0: ; %entry
2106; GFX10-WGP-NEXT:    s_clause 0x1
2107; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2108; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2109; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2110; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2111; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2112; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2113; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2114; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2115; GFX10-WGP-NEXT:    buffer_gl0_inv
2116; GFX10-WGP-NEXT:    buffer_gl1_inv
2117; GFX10-WGP-NEXT:    s_endpgm
2118;
2119; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
2120; GFX10-CU:       ; %bb.0: ; %entry
2121; GFX10-CU-NEXT:    s_clause 0x1
2122; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2123; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2124; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2125; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2126; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2127; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2128; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2129; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2130; GFX10-CU-NEXT:    buffer_gl0_inv
2131; GFX10-CU-NEXT:    buffer_gl1_inv
2132; GFX10-CU-NEXT:    s_endpgm
2133;
2134; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg:
2135; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2136; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2137; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2138; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2139; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2140; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2143; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2144; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2145; SKIP-CACHE-INV-NEXT:    s_endpgm
2146;
2147; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
2148; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2149; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2150; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2151; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2152; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2153; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2154; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2155; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2156; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2157; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2158;
2159; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
2160; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2161; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2162; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2163; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2164; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2166; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2167; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2168; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2169; GFX90A-TGSPLIT-NEXT:    s_endpgm
2170    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2171entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2172  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in. %val has no uses in this function.
2173  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
2174  ret void
2175}
2176
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering release and failure ordering acquire.
; The cmpxchg result is not used.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic (the two
;     GFX10 prefixes additionally expect "s_waitcnt_vscnt null, 0x0" there);
;   * after the atomic, a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0
;     for the two GFX10 prefixes);
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix, which expects the waits only.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2177define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
2178; GFX6-LABEL: global_agent_release_acquire_cmpxchg:
2179; GFX6:       ; %bb.0: ; %entry
2180; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2181; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2182; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2183; GFX6-NEXT:    s_mov_b32 s2, -1
2184; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2185; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2186; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2187; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2188; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2189; GFX6-NEXT:    s_waitcnt vmcnt(0)
2190; GFX6-NEXT:    buffer_wbinvl1
2191; GFX6-NEXT:    s_endpgm
2192;
2193; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
2194; GFX7:       ; %bb.0: ; %entry
2195; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2196; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2197; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX7-NEXT:    s_add_u32 s0, s0, 16
2199; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2200; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2201; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2202; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2203; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2204; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2205; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2206; GFX7-NEXT:    s_waitcnt vmcnt(0)
2207; GFX7-NEXT:    buffer_wbinvl1_vol
2208; GFX7-NEXT:    s_endpgm
2209;
2210; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg:
2211; GFX10-WGP:       ; %bb.0: ; %entry
2212; GFX10-WGP-NEXT:    s_clause 0x1
2213; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2214; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2215; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2216; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2219; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2220; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2221; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2222; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2223; GFX10-WGP-NEXT:    buffer_gl0_inv
2224; GFX10-WGP-NEXT:    buffer_gl1_inv
2225; GFX10-WGP-NEXT:    s_endpgm
2226;
2227; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg:
2228; GFX10-CU:       ; %bb.0: ; %entry
2229; GFX10-CU-NEXT:    s_clause 0x1
2230; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2231; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2232; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2233; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2235; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2236; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2237; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2238; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2239; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2240; GFX10-CU-NEXT:    buffer_gl0_inv
2241; GFX10-CU-NEXT:    buffer_gl1_inv
2242; GFX10-CU-NEXT:    s_endpgm
2243;
2244; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg:
2245; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2246; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2247; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2248; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2249; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2250; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2252; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2253; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2254; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2255; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2256; SKIP-CACHE-INV-NEXT:    s_endpgm
2257;
2258; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
2259; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2260; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2262; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2263; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2264; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2265; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2266; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2267; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2268; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2269; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2270;
2271; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
2272; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2273; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2275; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2276; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2277; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2278; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2279; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2280; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2281; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2282; GFX90A-TGSPLIT-NEXT:    s_endpgm
2283    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2284entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2285  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in. %val has no uses in this function.
2286  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire
2287  ret void
2288}
2289
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering acq_rel and failure ordering acquire.
; The cmpxchg result is not used.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic (the two
;     GFX10 prefixes additionally expect "s_waitcnt_vscnt null, 0x0" there);
;   * after the atomic, a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0
;     for the two GFX10 prefixes);
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix, which expects the waits only.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2290define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
2291; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2292; GFX6:       ; %bb.0: ; %entry
2293; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2294; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2295; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2296; GFX6-NEXT:    s_mov_b32 s2, -1
2297; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2298; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2299; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2300; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2301; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2302; GFX6-NEXT:    s_waitcnt vmcnt(0)
2303; GFX6-NEXT:    buffer_wbinvl1
2304; GFX6-NEXT:    s_endpgm
2305;
2306; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2307; GFX7:       ; %bb.0: ; %entry
2308; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2309; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2310; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX7-NEXT:    s_add_u32 s0, s0, 16
2312; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2313; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2314; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2315; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2316; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2317; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2318; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2319; GFX7-NEXT:    s_waitcnt vmcnt(0)
2320; GFX7-NEXT:    buffer_wbinvl1_vol
2321; GFX7-NEXT:    s_endpgm
2322;
2323; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2324; GFX10-WGP:       ; %bb.0: ; %entry
2325; GFX10-WGP-NEXT:    s_clause 0x1
2326; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2327; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2328; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2329; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2330; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2331; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2332; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2333; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2334; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2335; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2336; GFX10-WGP-NEXT:    buffer_gl0_inv
2337; GFX10-WGP-NEXT:    buffer_gl1_inv
2338; GFX10-WGP-NEXT:    s_endpgm
2339;
2340; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2341; GFX10-CU:       ; %bb.0: ; %entry
2342; GFX10-CU-NEXT:    s_clause 0x1
2343; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2344; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2345; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2346; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2347; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2348; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2349; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2350; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2351; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2352; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2353; GFX10-CU-NEXT:    buffer_gl0_inv
2354; GFX10-CU-NEXT:    buffer_gl1_inv
2355; GFX10-CU-NEXT:    s_endpgm
2356;
2357; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2358; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2359; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2360; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2361; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2362; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2363; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2364; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2365; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2366; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2367; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2368; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2369; SKIP-CACHE-INV-NEXT:    s_endpgm
2370;
2371; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2372; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2373; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2374; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2375; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2376; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2377; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2378; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2379; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2380; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2381; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2382; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2383;
2384; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
2385; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2386; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2387; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2388; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2389; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2391; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2392; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2393; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2394; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2395; GFX90A-TGSPLIT-NEXT:    s_endpgm
2396    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2397entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2398  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in. %val has no uses in this function.
2399  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
2400  ret void
2401}
2402
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering seq_cst and failure ordering acquire.
; The cmpxchg result is not used.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic (the two
;     GFX10 prefixes additionally expect "s_waitcnt_vscnt null, 0x0" there);
;   * after the atomic, a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0
;     for the two GFX10 prefixes);
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix, which expects the waits only.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2403define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
2404; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2405; GFX6:       ; %bb.0: ; %entry
2406; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2407; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2408; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2409; GFX6-NEXT:    s_mov_b32 s2, -1
2410; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2411; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2412; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2413; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2414; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2415; GFX6-NEXT:    s_waitcnt vmcnt(0)
2416; GFX6-NEXT:    buffer_wbinvl1
2417; GFX6-NEXT:    s_endpgm
2418;
2419; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2420; GFX7:       ; %bb.0: ; %entry
2421; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2422; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2423; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2424; GFX7-NEXT:    s_add_u32 s0, s0, 16
2425; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2426; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2427; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2428; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2429; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2430; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2431; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2432; GFX7-NEXT:    s_waitcnt vmcnt(0)
2433; GFX7-NEXT:    buffer_wbinvl1_vol
2434; GFX7-NEXT:    s_endpgm
2435;
2436; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2437; GFX10-WGP:       ; %bb.0: ; %entry
2438; GFX10-WGP-NEXT:    s_clause 0x1
2439; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2440; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2441; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2442; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2443; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2444; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2445; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2446; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2447; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2448; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2449; GFX10-WGP-NEXT:    buffer_gl0_inv
2450; GFX10-WGP-NEXT:    buffer_gl1_inv
2451; GFX10-WGP-NEXT:    s_endpgm
2452;
2453; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2454; GFX10-CU:       ; %bb.0: ; %entry
2455; GFX10-CU-NEXT:    s_clause 0x1
2456; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2457; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2458; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2459; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2460; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2461; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2462; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2463; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2464; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2465; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2466; GFX10-CU-NEXT:    buffer_gl0_inv
2467; GFX10-CU-NEXT:    buffer_gl1_inv
2468; GFX10-CU-NEXT:    s_endpgm
2469;
2470; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2471; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2472; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2473; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2474; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2475; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2476; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2477; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2478; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2479; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2480; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2481; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2482; SKIP-CACHE-INV-NEXT:    s_endpgm
2483;
2484; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2485; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2486; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2487; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2488; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2489; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2490; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2491; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2492; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2493; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2494; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2495; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2496;
2497; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
2498; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2499; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2500; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2501; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2502; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2503; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2504; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2505; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2506; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2507; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2508; GFX90A-TGSPLIT-NEXT:    s_endpgm
2509    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2510entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2511  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in. %val has no uses in this function.
2512  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
2513  ret void
2514}
2515
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering seq_cst and failure ordering seq_cst.
; The cmpxchg result is not used.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic (the two
;     GFX10 prefixes additionally expect "s_waitcnt_vscnt null, 0x0" there);
;   * after the atomic, a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt null, 0x0
;     for the two GFX10 prefixes);
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix, which expects the waits only.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2516define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
2517; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2518; GFX6:       ; %bb.0: ; %entry
2519; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2520; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2521; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2522; GFX6-NEXT:    s_mov_b32 s2, -1
2523; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2525; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2526; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2527; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2528; GFX6-NEXT:    s_waitcnt vmcnt(0)
2529; GFX6-NEXT:    buffer_wbinvl1
2530; GFX6-NEXT:    s_endpgm
2531;
2532; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2533; GFX7:       ; %bb.0: ; %entry
2534; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2535; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2536; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2537; GFX7-NEXT:    s_add_u32 s0, s0, 16
2538; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2539; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2540; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2541; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2542; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2543; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2544; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2545; GFX7-NEXT:    s_waitcnt vmcnt(0)
2546; GFX7-NEXT:    buffer_wbinvl1_vol
2547; GFX7-NEXT:    s_endpgm
2548;
2549; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2550; GFX10-WGP:       ; %bb.0: ; %entry
2551; GFX10-WGP-NEXT:    s_clause 0x1
2552; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2553; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2554; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2555; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2556; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2557; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2558; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2559; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2560; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2561; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2562; GFX10-WGP-NEXT:    buffer_gl0_inv
2563; GFX10-WGP-NEXT:    buffer_gl1_inv
2564; GFX10-WGP-NEXT:    s_endpgm
2565;
2566; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2567; GFX10-CU:       ; %bb.0: ; %entry
2568; GFX10-CU-NEXT:    s_clause 0x1
2569; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2570; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2571; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2572; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2573; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2574; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2575; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2576; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2577; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2578; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2579; GFX10-CU-NEXT:    buffer_gl0_inv
2580; GFX10-CU-NEXT:    buffer_gl1_inv
2581; GFX10-CU-NEXT:    s_endpgm
2582;
2583; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2584; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2585; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2586; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2587; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2588; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2589; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2592; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2593; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2594; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2595; SKIP-CACHE-INV-NEXT:    s_endpgm
2596;
2597; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2598; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2599; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2600; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2601; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2602; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2603; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2604; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2605; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2606; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2607; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2608; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2609;
2610; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
2611; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2612; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2613; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2614; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2615; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2616; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2617; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2618; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2619; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2620; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2621; GFX90A-TGSPLIT-NEXT:    s_endpgm
2622    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2623entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2624  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in. %val has no uses in this function.
2625  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
2626  ret void
2627}
2628
; Test kernel: volatile cmpxchg through an addrspace(1) pointer with
; syncscope("agent"), success ordering acquire and failure ordering monotonic.
; Unlike the variants without "_ret" in their name, field 0 of the cmpxchg
; result is extracted and stored to %out.
;
; What the autogenerated checks below expect, per RUN prefix:
;   * the atomic carries the glc modifier (the flat_/global_ forms also name a
;     destination VGPR);
;   * no "s_waitcnt vmcnt(0) lgkmcnt(0)" directly in front of the atomic;
;   * after the atomic, "s_waitcnt vmcnt(0)" on every prefix, including the two
;     GFX10 ones;
;   * then a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or
;     buffer_gl0_inv followed by buffer_gl1_inv), except for the SKIP-CACHE-INV
;     prefix;
;   * finally a dword store of the returned value.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2629define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
2630; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2631; GFX6:       ; %bb.0: ; %entry
2632; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2633; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2634; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2635; GFX6-NEXT:    s_mov_b32 s2, -1
2636; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2637; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2638; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2639; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2640; GFX6-NEXT:    s_waitcnt vmcnt(0)
2641; GFX6-NEXT:    buffer_wbinvl1
2642; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2643; GFX6-NEXT:    s_endpgm
2644;
2645; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2646; GFX7:       ; %bb.0: ; %entry
2647; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2648; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2649; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2650; GFX7-NEXT:    s_add_u32 s4, s0, 16
2651; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2652; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2653; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2654; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2655; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2656; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2657; GFX7-NEXT:    s_waitcnt vmcnt(0)
2658; GFX7-NEXT:    buffer_wbinvl1_vol
2659; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2660; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2661; GFX7-NEXT:    flat_store_dword v[0:1], v2
2662; GFX7-NEXT:    s_endpgm
2663;
2664; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2665; GFX10-WGP:       ; %bb.0: ; %entry
2666; GFX10-WGP-NEXT:    s_clause 0x1
2667; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2668; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2669; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2670; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2671; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2672; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2673; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2674; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2675; GFX10-WGP-NEXT:    buffer_gl0_inv
2676; GFX10-WGP-NEXT:    buffer_gl1_inv
2677; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2678; GFX10-WGP-NEXT:    s_endpgm
2679;
2680; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2681; GFX10-CU:       ; %bb.0: ; %entry
2682; GFX10-CU-NEXT:    s_clause 0x1
2683; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2684; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2685; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2686; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2687; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2688; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2689; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2690; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2691; GFX10-CU-NEXT:    buffer_gl0_inv
2692; GFX10-CU-NEXT:    buffer_gl1_inv
2693; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2694; GFX10-CU-NEXT:    s_endpgm
2695;
2696; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2697; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2698; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2699; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2700; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2701; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2702; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2703; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2705; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2706; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2707; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2708; SKIP-CACHE-INV-NEXT:    s_endpgm
2709;
2710; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2711; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2712; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2713; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2714; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2715; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2716; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2717; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2718; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2719; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2720; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2721; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2722;
2723; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
2724; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2725; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2726; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2727; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2728; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2729; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2730; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2731; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2732; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2733; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2734; GFX90A-TGSPLIT-NEXT:    s_endpgm
2735    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2736entry:
  ; %gep addresses element 4 of %out; with i32 elements that is the 16-byte
  ; offset seen in the expected instructions above.
2737  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in.
2738  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the i32 value; store it at %out
  ; itself (element 0), not at %gep.
2739  %val0 = extractvalue { i32, i1 } %val, 0
2740  store i32 %val0, i32 addrspace(1)* %out, align 4
2741  ret void
2742}
2743
; Returning cmpxchg through an addrspace(1) pointer (global, per the test name)
; at syncscope("agent") with acq_rel success ordering and monotonic failure
; ordering. The value read by the cmpxchg is stored back through %out.
; In the expected code below, every target waits on vmcnt(0) lgkmcnt(0) ahead of
; the atomic and on vmcnt(0) after it; a cache invalidate follows that second
; wait on every run except the skip-cache-invalidations one.
2744define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
2745; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2746; GFX6:       ; %bb.0: ; %entry
2747; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2748; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2749; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2750; GFX6-NEXT:    s_mov_b32 s2, -1
2751; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2752; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2753; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2754; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2755; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2756; GFX6-NEXT:    s_waitcnt vmcnt(0)
2757; GFX6-NEXT:    buffer_wbinvl1
2758; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2759; GFX6-NEXT:    s_endpgm
2760;
2761; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2762; GFX7:       ; %bb.0: ; %entry
2763; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2764; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2765; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX7-NEXT:    s_add_u32 s4, s0, 16
2767; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2768; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2769; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2770; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2771; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2772; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2773; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2774; GFX7-NEXT:    s_waitcnt vmcnt(0)
2775; GFX7-NEXT:    buffer_wbinvl1_vol
2776; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2777; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2778; GFX7-NEXT:    flat_store_dword v[0:1], v2
2779; GFX7-NEXT:    s_endpgm
2780;
2781; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2782; GFX10-WGP:       ; %bb.0: ; %entry
2783; GFX10-WGP-NEXT:    s_clause 0x1
2784; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2785; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2786; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2787; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2788; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2789; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2790; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2791; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2792; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2793; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2794; GFX10-WGP-NEXT:    buffer_gl0_inv
2795; GFX10-WGP-NEXT:    buffer_gl1_inv
2796; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2797; GFX10-WGP-NEXT:    s_endpgm
2798;
2799; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2800; GFX10-CU:       ; %bb.0: ; %entry
2801; GFX10-CU-NEXT:    s_clause 0x1
2802; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2803; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2804; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2805; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2806; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2807; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2808; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2809; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2810; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2811; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2812; GFX10-CU-NEXT:    buffer_gl0_inv
2813; GFX10-CU-NEXT:    buffer_gl1_inv
2814; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2815; GFX10-CU-NEXT:    s_endpgm
2816;
2817; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2818; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2819; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2820; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2821; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2822; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2823; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2824; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2825; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2826; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2827; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2828; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2829; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2830; SKIP-CACHE-INV-NEXT:    s_endpgm
2831;
2832; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2833; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2834; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2835; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2836; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2837; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2838; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2839; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2840; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2841; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2842; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2843; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2844; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2845;
2846; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
2847; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2848; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2849; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2850; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2851; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2852; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2853; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2854; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2855; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2856; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2857; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2858; GFX90A-TGSPLIT-NEXT:    s_endpgm
2859    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2860entry:
  ; Element 4 of %out in i32 units; appears as a 16-byte offset in the expected code.
2861  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparand and %in the replacement; the ordering pair under test is on this line.
2862  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
2863  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it keeps the returned value live (presumably what selects the returning glc form).
2864  store i32 %val0, i32 addrspace(1)* %out, align 4
2865  ret void
2866}
2867
; Returning cmpxchg through an addrspace(1) pointer (global, per the test name)
; at syncscope("agent") with seq_cst success ordering and monotonic failure
; ordering. The value read by the cmpxchg is stored back through %out.
; In the expected code below, every target waits on vmcnt(0) lgkmcnt(0) ahead of
; the atomic and on vmcnt(0) after it; a cache invalidate follows that second
; wait on every run except the skip-cache-invalidations one.
2868define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
2869; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2870; GFX6:       ; %bb.0: ; %entry
2871; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2872; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2873; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2874; GFX6-NEXT:    s_mov_b32 s2, -1
2875; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2876; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2877; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2878; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2879; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2880; GFX6-NEXT:    s_waitcnt vmcnt(0)
2881; GFX6-NEXT:    buffer_wbinvl1
2882; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2883; GFX6-NEXT:    s_endpgm
2884;
2885; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2886; GFX7:       ; %bb.0: ; %entry
2887; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2888; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2889; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2890; GFX7-NEXT:    s_add_u32 s4, s0, 16
2891; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2892; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2893; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2894; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2895; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2896; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2897; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2898; GFX7-NEXT:    s_waitcnt vmcnt(0)
2899; GFX7-NEXT:    buffer_wbinvl1_vol
2900; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2901; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2902; GFX7-NEXT:    flat_store_dword v[0:1], v2
2903; GFX7-NEXT:    s_endpgm
2904;
2905; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2906; GFX10-WGP:       ; %bb.0: ; %entry
2907; GFX10-WGP-NEXT:    s_clause 0x1
2908; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2909; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2910; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2911; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2912; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2913; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2914; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2915; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2916; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2917; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2918; GFX10-WGP-NEXT:    buffer_gl0_inv
2919; GFX10-WGP-NEXT:    buffer_gl1_inv
2920; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2921; GFX10-WGP-NEXT:    s_endpgm
2922;
2923; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2924; GFX10-CU:       ; %bb.0: ; %entry
2925; GFX10-CU-NEXT:    s_clause 0x1
2926; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2927; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2928; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2929; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2930; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2931; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2932; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2933; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2934; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2935; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2936; GFX10-CU-NEXT:    buffer_gl0_inv
2937; GFX10-CU-NEXT:    buffer_gl1_inv
2938; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2939; GFX10-CU-NEXT:    s_endpgm
2940;
2941; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2942; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2943; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2944; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2945; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2946; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2947; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2948; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2949; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2950; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2951; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2952; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2953; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2954; SKIP-CACHE-INV-NEXT:    s_endpgm
2955;
2956; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2957; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2958; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2959; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2960; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2961; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2962; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2963; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2964; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2965; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2966; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2967; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2968; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2969;
2970; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
2971; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2972; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2973; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2974; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2975; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2976; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2977; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2978; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2979; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2980; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2981; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2982; GFX90A-TGSPLIT-NEXT:    s_endpgm
2983    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2984entry:
  ; Element 4 of %out in i32 units; appears as a 16-byte offset in the expected code.
2985  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparand and %in the replacement; the ordering pair under test is on this line.
2986  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
2987  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it keeps the returned value live (presumably what selects the returning glc form).
2988  store i32 %val0, i32 addrspace(1)* %out, align 4
2989  ret void
2990}
2991
; Returning cmpxchg through an addrspace(1) pointer (global, per the test name)
; at syncscope("agent") with acquire ordering for both success and failure.
; The value read by the cmpxchg is stored back through %out.
; In the expected code below, no `s_waitcnt vmcnt(0) lgkmcnt(0)` appears ahead of
; the atomic; only the vmcnt(0) wait after it, followed by a cache invalidate on
; every run except the skip-cache-invalidations one.
2992define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
2993; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
2994; GFX6:       ; %bb.0: ; %entry
2995; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2996; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2997; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2998; GFX6-NEXT:    s_mov_b32 s2, -1
2999; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3000; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3001; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3002; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3003; GFX6-NEXT:    s_waitcnt vmcnt(0)
3004; GFX6-NEXT:    buffer_wbinvl1
3005; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3006; GFX6-NEXT:    s_endpgm
3007;
3008; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3009; GFX7:       ; %bb.0: ; %entry
3010; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3011; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3012; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3013; GFX7-NEXT:    s_add_u32 s4, s0, 16
3014; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3015; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3016; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3017; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3018; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3019; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3020; GFX7-NEXT:    s_waitcnt vmcnt(0)
3021; GFX7-NEXT:    buffer_wbinvl1_vol
3022; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3023; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3024; GFX7-NEXT:    flat_store_dword v[0:1], v2
3025; GFX7-NEXT:    s_endpgm
3026;
3027; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3028; GFX10-WGP:       ; %bb.0: ; %entry
3029; GFX10-WGP-NEXT:    s_clause 0x1
3030; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3031; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3032; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3033; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3035; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3036; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3037; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3038; GFX10-WGP-NEXT:    buffer_gl0_inv
3039; GFX10-WGP-NEXT:    buffer_gl1_inv
3040; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3041; GFX10-WGP-NEXT:    s_endpgm
3042;
3043; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3044; GFX10-CU:       ; %bb.0: ; %entry
3045; GFX10-CU-NEXT:    s_clause 0x1
3046; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3047; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3048; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3049; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3051; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3052; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3053; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3054; GFX10-CU-NEXT:    buffer_gl0_inv
3055; GFX10-CU-NEXT:    buffer_gl1_inv
3056; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3057; GFX10-CU-NEXT:    s_endpgm
3058;
3059; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3060; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3061; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3062; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3063; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3064; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3065; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3066; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3068; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3069; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3070; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3071; SKIP-CACHE-INV-NEXT:    s_endpgm
3072;
3073; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3074; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3075; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3076; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3077; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3078; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3079; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3080; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3081; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3082; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3083; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3084; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3085;
3086; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
3087; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3088; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3089; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3090; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3091; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3092; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3093; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3094; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3095; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3096; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3097; GFX90A-TGSPLIT-NEXT:    s_endpgm
3098    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3099entry:
  ; Element 4 of %out in i32 units; appears as a 16-byte offset in the expected code.
3100  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparand and %in the replacement; the ordering pair under test is on this line.
3101  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
3102  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it keeps the returned value live (presumably what selects the returning glc form).
3103  store i32 %val0, i32 addrspace(1)* %out, align 4
3104  ret void
3105}
3106
; Returning cmpxchg through an addrspace(1) pointer (global, per the test name)
; at syncscope("agent") with release success ordering and acquire failure
; ordering. The value read by the cmpxchg is stored back through %out.
; In the expected code below, every target waits on vmcnt(0) lgkmcnt(0) ahead of
; the atomic and on vmcnt(0) after it; a cache invalidate follows that second
; wait on every run except the skip-cache-invalidations one.
3107define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
3108; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg:
3109; GFX6:       ; %bb.0: ; %entry
3110; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3111; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3112; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3113; GFX6-NEXT:    s_mov_b32 s2, -1
3114; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3115; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3116; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3117; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3118; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3119; GFX6-NEXT:    s_waitcnt vmcnt(0)
3120; GFX6-NEXT:    buffer_wbinvl1
3121; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3122; GFX6-NEXT:    s_endpgm
3123;
3124; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
3125; GFX7:       ; %bb.0: ; %entry
3126; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3127; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3128; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3129; GFX7-NEXT:    s_add_u32 s4, s0, 16
3130; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3131; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3132; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3133; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3134; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3135; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3136; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3137; GFX7-NEXT:    s_waitcnt vmcnt(0)
3138; GFX7-NEXT:    buffer_wbinvl1_vol
3139; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3140; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3141; GFX7-NEXT:    flat_store_dword v[0:1], v2
3142; GFX7-NEXT:    s_endpgm
3143;
3144; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
3145; GFX10-WGP:       ; %bb.0: ; %entry
3146; GFX10-WGP-NEXT:    s_clause 0x1
3147; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3148; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3149; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3150; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3151; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3152; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3153; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3154; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3155; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3156; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3157; GFX10-WGP-NEXT:    buffer_gl0_inv
3158; GFX10-WGP-NEXT:    buffer_gl1_inv
3159; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3160; GFX10-WGP-NEXT:    s_endpgm
3161;
3162; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
3163; GFX10-CU:       ; %bb.0: ; %entry
3164; GFX10-CU-NEXT:    s_clause 0x1
3165; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3166; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3167; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3168; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3169; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3170; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3171; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3172; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3173; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3174; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3175; GFX10-CU-NEXT:    buffer_gl0_inv
3176; GFX10-CU-NEXT:    buffer_gl1_inv
3177; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3178; GFX10-CU-NEXT:    s_endpgm
3179;
3180; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg:
3181; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3182; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3183; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3184; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3185; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3186; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3187; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3189; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3190; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3191; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3192; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3193; SKIP-CACHE-INV-NEXT:    s_endpgm
3194;
3195; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
3196; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3198; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3199; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3200; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3201; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3203; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3204; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3205; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3206; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3207; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3208;
3209; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
3210; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3211; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3212; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3213; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3214; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3215; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3216; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3217; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3218; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3219; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3220; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3221; GFX90A-TGSPLIT-NEXT:    s_endpgm
3222    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3223entry:
  ; Element 4 of %out in i32 units; appears as a 16-byte offset in the expected code.
3224  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparand and %in the replacement; the ordering pair under test is on this line.
3225  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
3226  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it keeps the returned value live (presumably what selects the returning glc form).
3227  store i32 %val0, i32 addrspace(1)* %out, align 4
3228  ret void
3229}
3230
; Returning cmpxchg through an addrspace(1) pointer (global, per the test name)
; at syncscope("agent") with acq_rel success ordering and acquire failure
; ordering. The value read by the cmpxchg is stored back through %out.
; In the expected code below, every target waits on vmcnt(0) lgkmcnt(0) ahead of
; the atomic and on vmcnt(0) after it; a cache invalidate follows that second
; wait on every run except the skip-cache-invalidations one.
3231define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
3232; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3233; GFX6:       ; %bb.0: ; %entry
3234; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3235; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3236; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3237; GFX6-NEXT:    s_mov_b32 s2, -1
3238; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3239; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3240; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3241; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3242; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3243; GFX6-NEXT:    s_waitcnt vmcnt(0)
3244; GFX6-NEXT:    buffer_wbinvl1
3245; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3246; GFX6-NEXT:    s_endpgm
3247;
3248; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3249; GFX7:       ; %bb.0: ; %entry
3250; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3251; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3252; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3253; GFX7-NEXT:    s_add_u32 s4, s0, 16
3254; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3255; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3256; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3257; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3258; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3259; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3260; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3261; GFX7-NEXT:    s_waitcnt vmcnt(0)
3262; GFX7-NEXT:    buffer_wbinvl1_vol
3263; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3264; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3265; GFX7-NEXT:    flat_store_dword v[0:1], v2
3266; GFX7-NEXT:    s_endpgm
3267;
3268; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3269; GFX10-WGP:       ; %bb.0: ; %entry
3270; GFX10-WGP-NEXT:    s_clause 0x1
3271; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3272; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3273; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3274; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3275; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3276; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3277; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3278; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3279; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3280; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3281; GFX10-WGP-NEXT:    buffer_gl0_inv
3282; GFX10-WGP-NEXT:    buffer_gl1_inv
3283; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3284; GFX10-WGP-NEXT:    s_endpgm
3285;
3286; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3287; GFX10-CU:       ; %bb.0: ; %entry
3288; GFX10-CU-NEXT:    s_clause 0x1
3289; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3290; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3291; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3292; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3294; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3295; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3296; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3297; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3298; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3299; GFX10-CU-NEXT:    buffer_gl0_inv
3300; GFX10-CU-NEXT:    buffer_gl1_inv
3301; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3302; GFX10-CU-NEXT:    s_endpgm
3303;
3304; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3305; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3306; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3307; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3309; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3310; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3311; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3313; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3314; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3315; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3316; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3317; SKIP-CACHE-INV-NEXT:    s_endpgm
3318;
3319; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3320; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3321; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3322; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3323; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3324; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3325; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3326; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3327; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3328; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3329; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3330; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3331; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3332;
3333; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
3334; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3335; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3336; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3337; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3338; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3339; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3340; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3341; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3342; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3343; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3344; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3345; GFX90A-TGSPLIT-NEXT:    s_endpgm
3346    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3347entry:
  ; Element 4 of %out in i32 units; appears as a 16-byte offset in the expected code.
3348  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparand and %in the replacement; the ordering pair under test is on this line.
3349  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
3350  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing it keeps the returned value live (presumably what selects the returning glc form).
3351  store i32 %val0, i32 addrspace(1)* %out, align 4
3352  ret void
3353}
3354
3355define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
3356; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3357; GFX6:       ; %bb.0: ; %entry
3358; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3359; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3360; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3361; GFX6-NEXT:    s_mov_b32 s2, -1
3362; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3363; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3364; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3365; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3366; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3367; GFX6-NEXT:    s_waitcnt vmcnt(0)
3368; GFX6-NEXT:    buffer_wbinvl1
3369; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3370; GFX6-NEXT:    s_endpgm
3371;
3372; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3373; GFX7:       ; %bb.0: ; %entry
3374; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3375; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3376; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3377; GFX7-NEXT:    s_add_u32 s4, s0, 16
3378; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3379; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3380; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3381; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3382; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3383; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3384; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3385; GFX7-NEXT:    s_waitcnt vmcnt(0)
3386; GFX7-NEXT:    buffer_wbinvl1_vol
3387; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3388; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3389; GFX7-NEXT:    flat_store_dword v[0:1], v2
3390; GFX7-NEXT:    s_endpgm
3391;
3392; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3393; GFX10-WGP:       ; %bb.0: ; %entry
3394; GFX10-WGP-NEXT:    s_clause 0x1
3395; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3396; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3397; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3398; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3399; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3400; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3401; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3402; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3403; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3404; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3405; GFX10-WGP-NEXT:    buffer_gl0_inv
3406; GFX10-WGP-NEXT:    buffer_gl1_inv
3407; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3408; GFX10-WGP-NEXT:    s_endpgm
3409;
3410; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3411; GFX10-CU:       ; %bb.0: ; %entry
3412; GFX10-CU-NEXT:    s_clause 0x1
3413; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3414; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3415; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3416; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3418; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3419; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3420; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3421; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3422; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3423; GFX10-CU-NEXT:    buffer_gl0_inv
3424; GFX10-CU-NEXT:    buffer_gl1_inv
3425; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3426; GFX10-CU-NEXT:    s_endpgm
3427;
3428; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3429; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3431; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3432; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3433; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3434; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3437; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3438; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3439; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3440; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3441; SKIP-CACHE-INV-NEXT:    s_endpgm
3442;
3443; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3444; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3445; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3446; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3447; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3448; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3449; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3450; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3451; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3452; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3453; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3454; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3455; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3456;
3457; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
3458; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3459; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3460; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3461; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3462; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3464; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3465; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3466; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3467; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3468; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3469; GFX90A-TGSPLIT-NEXT:    s_endpgm
3470    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3471entry:
3472  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
3473  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
3474  %val0 = extractvalue { i32, i1 } %val, 0
3475  store i32 %val0, i32 addrspace(1)* %out, align 4
3476  ret void
3477}
3478
; Returning cmpxchg on global memory at syncscope("agent") with seq_cst
; success ordering and seq_cst failure ordering; the loaded value is stored
; back through %out.
; All configurations expect `s_waitcnt vmcnt(0) lgkmcnt(0)` ahead of the
; atomic (plus `s_waitcnt_vscnt null, 0x0` on gfx1010), a cmpswap carrying
; the glc bit, and `s_waitcnt vmcnt(0)` after it. The gfx600, gfx700, gfx1010
; and gfx90a configurations then expect a cache invalidate (buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv) before the
; dependent store; the configuration run with
; -amdgcn-skip-cache-invalidations expects no invalidate instruction.
3479define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
3480; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3481; GFX6:       ; %bb.0: ; %entry
3482; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3483; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3484; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3485; GFX6-NEXT:    s_mov_b32 s2, -1
3486; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3487; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3488; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3489; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3490; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3491; GFX6-NEXT:    s_waitcnt vmcnt(0)
3492; GFX6-NEXT:    buffer_wbinvl1
3493; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3494; GFX6-NEXT:    s_endpgm
3495;
3496; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3497; GFX7:       ; %bb.0: ; %entry
3498; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3499; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3500; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3501; GFX7-NEXT:    s_add_u32 s4, s0, 16
3502; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3503; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3504; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3505; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3506; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3507; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3508; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3509; GFX7-NEXT:    s_waitcnt vmcnt(0)
3510; GFX7-NEXT:    buffer_wbinvl1_vol
3511; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3512; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3513; GFX7-NEXT:    flat_store_dword v[0:1], v2
3514; GFX7-NEXT:    s_endpgm
3515;
3516; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3517; GFX10-WGP:       ; %bb.0: ; %entry
3518; GFX10-WGP-NEXT:    s_clause 0x1
3519; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3520; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3521; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3522; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3523; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3524; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3525; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3526; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3527; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3528; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3529; GFX10-WGP-NEXT:    buffer_gl0_inv
3530; GFX10-WGP-NEXT:    buffer_gl1_inv
3531; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3532; GFX10-WGP-NEXT:    s_endpgm
3533;
3534; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3535; GFX10-CU:       ; %bb.0: ; %entry
3536; GFX10-CU-NEXT:    s_clause 0x1
3537; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3538; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3539; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3540; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3541; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3542; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3543; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3544; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3545; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3546; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3547; GFX10-CU-NEXT:    buffer_gl0_inv
3548; GFX10-CU-NEXT:    buffer_gl1_inv
3549; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3550; GFX10-CU-NEXT:    s_endpgm
3551;
3552; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3553; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3554; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3555; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3556; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3557; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3558; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3559; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3560; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3561; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3562; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3563; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3564; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3565; SKIP-CACHE-INV-NEXT:    s_endpgm
3566;
3567; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3568; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3569; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3570; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3571; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3572; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3573; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3574; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3575; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3576; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3577; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3578; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3579; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3580;
3581; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
3582; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3583; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3584; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3585; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3586; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3587; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3588; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3589; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3590; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3591; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3592; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3593; GFX90A-TGSPLIT-NEXT:    s_endpgm
3594    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3595entry:
  ; %gep is 4 i32 elements (16 bytes) past %out; this is the 16-byte offset
  ; that shows up in the expected atomic instructions above.
3596  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
3597  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ; Field 0 (the i32) of the cmpxchg result pair is what gets stored to %out,
  ; so the atomic's result must be produced.
3598  %val0 = extractvalue { i32, i1 } %val, 0
3599  store i32 %val0, i32 addrspace(1)* %out, align 4
3600  ret void
3601}
3602
; Atomic i32 load from global memory at syncscope("agent-one-as") with
; unordered ordering; the loaded value is then stored through %out.
; Every configuration expects a plain load with no cache-policy bits and no
; cache invalidate; the only vector-memory wait is the `s_waitcnt vmcnt(0)`
; immediately before the store that consumes the loaded value.
3603define amdgpu_kernel void @global_agent_one_as_unordered_load(
3604; GFX6-LABEL: global_agent_one_as_unordered_load:
3605; GFX6:       ; %bb.0: ; %entry
3606; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3607; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3608; GFX6-NEXT:    s_mov_b32 s2, -1
3609; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3610; GFX6-NEXT:    s_mov_b32 s0, s4
3611; GFX6-NEXT:    s_mov_b32 s1, s5
3612; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
3613; GFX6-NEXT:    s_mov_b32 s4, s6
3614; GFX6-NEXT:    s_mov_b32 s5, s7
3615; GFX6-NEXT:    s_mov_b32 s6, s2
3616; GFX6-NEXT:    s_mov_b32 s7, s3
3617; GFX6-NEXT:    s_waitcnt vmcnt(0)
3618; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3619; GFX6-NEXT:    s_endpgm
3620;
3621; GFX7-LABEL: global_agent_one_as_unordered_load:
3622; GFX7:       ; %bb.0: ; %entry
3623; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3624; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3625; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3626; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3627; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3628; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3629; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3630; GFX7-NEXT:    s_waitcnt vmcnt(0)
3631; GFX7-NEXT:    flat_store_dword v[2:3], v0
3632; GFX7-NEXT:    s_endpgm
3633;
3634; GFX10-WGP-LABEL: global_agent_one_as_unordered_load:
3635; GFX10-WGP:       ; %bb.0: ; %entry
3636; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3637; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3638; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3639; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
3640; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3641; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3642; GFX10-WGP-NEXT:    s_endpgm
3643;
3644; GFX10-CU-LABEL: global_agent_one_as_unordered_load:
3645; GFX10-CU:       ; %bb.0: ; %entry
3646; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3647; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3648; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3649; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
3650; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3651; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3652; GFX10-CU-NEXT:    s_endpgm
3653;
3654; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load:
3655; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3656; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3657; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3658; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3659; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3660; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3661; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3662; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
3663; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3664; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3665; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3666; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3667; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3668; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3669; SKIP-CACHE-INV-NEXT:    s_endpgm
3670;
3671; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load:
3672; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3673; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3674; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3675; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
3677; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3678; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3679; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3680;
3681; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load:
3682; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3683; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3684; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3685; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3686; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
3687; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3688; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3689; GFX90A-TGSPLIT-NEXT:    s_endpgm
3690    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3691entry:
3692  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4
  ; Non-atomic store of the loaded value; it keeps the load's result live.
3693  store i32 %val, i32 addrspace(1)* %out
3694  ret void
3695}
3696
; Atomic i32 load from global memory at syncscope("agent-one-as") with
; monotonic ordering; the loaded value is then stored through %out.
; Every configuration expects the load to carry the glc bit (glc and dlc on
; gfx1010) and expects no cache invalidate; the only vector-memory wait is
; the `s_waitcnt vmcnt(0)` immediately before the store that consumes the
; loaded value.
3697define amdgpu_kernel void @global_agent_one_as_monotonic_load(
3698; GFX6-LABEL: global_agent_one_as_monotonic_load:
3699; GFX6:       ; %bb.0: ; %entry
3700; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3701; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3702; GFX6-NEXT:    s_mov_b32 s2, -1
3703; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3704; GFX6-NEXT:    s_mov_b32 s0, s4
3705; GFX6-NEXT:    s_mov_b32 s1, s5
3706; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3707; GFX6-NEXT:    s_mov_b32 s4, s6
3708; GFX6-NEXT:    s_mov_b32 s5, s7
3709; GFX6-NEXT:    s_mov_b32 s6, s2
3710; GFX6-NEXT:    s_mov_b32 s7, s3
3711; GFX6-NEXT:    s_waitcnt vmcnt(0)
3712; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3713; GFX6-NEXT:    s_endpgm
3714;
3715; GFX7-LABEL: global_agent_one_as_monotonic_load:
3716; GFX7:       ; %bb.0: ; %entry
3717; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3718; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3719; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3720; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3721; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3722; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3723; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3724; GFX7-NEXT:    s_waitcnt vmcnt(0)
3725; GFX7-NEXT:    flat_store_dword v[2:3], v0
3726; GFX7-NEXT:    s_endpgm
3727;
3728; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load:
3729; GFX10-WGP:       ; %bb.0: ; %entry
3730; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3731; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3732; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3733; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3734; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3735; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3736; GFX10-WGP-NEXT:    s_endpgm
3737;
3738; GFX10-CU-LABEL: global_agent_one_as_monotonic_load:
3739; GFX10-CU:       ; %bb.0: ; %entry
3740; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3741; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3742; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3743; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3744; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3745; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3746; GFX10-CU-NEXT:    s_endpgm
3747;
3748; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load:
3749; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3750; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3752; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3753; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3754; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3755; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3756; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3757; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3758; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3759; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3760; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3761; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3762; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3763; SKIP-CACHE-INV-NEXT:    s_endpgm
3764;
3765; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load:
3766; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3767; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3768; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3769; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3771; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3772; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3773; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3774;
3775; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load:
3776; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3777; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3778; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3779; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3780; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3781; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3782; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3783; GFX90A-TGSPLIT-NEXT:    s_endpgm
3784    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3785entry:
3786  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4
  ; Non-atomic store of the loaded value; it keeps the load's result live.
3787  store i32 %val, i32 addrspace(1)* %out
3788  ret void
3789}
3790
; Atomic i32 load from global memory at syncscope("agent-one-as") with
; acquire ordering; the loaded value is then stored through %out.
; Every configuration expects the load to carry the glc bit (glc and dlc on
; gfx1010) followed directly by `s_waitcnt vmcnt(0)`. The gfx600, gfx700,
; gfx1010 and gfx90a configurations then expect a cache invalidate
; (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv)
; ahead of the store; the configuration run with
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
3791define amdgpu_kernel void @global_agent_one_as_acquire_load(
3792; GFX6-LABEL: global_agent_one_as_acquire_load:
3793; GFX6:       ; %bb.0: ; %entry
3794; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3795; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3796; GFX6-NEXT:    s_mov_b32 s2, -1
3797; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3798; GFX6-NEXT:    s_mov_b32 s0, s4
3799; GFX6-NEXT:    s_mov_b32 s1, s5
3800; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3801; GFX6-NEXT:    s_waitcnt vmcnt(0)
3802; GFX6-NEXT:    buffer_wbinvl1
3803; GFX6-NEXT:    s_mov_b32 s4, s6
3804; GFX6-NEXT:    s_mov_b32 s5, s7
3805; GFX6-NEXT:    s_mov_b32 s6, s2
3806; GFX6-NEXT:    s_mov_b32 s7, s3
3807; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3808; GFX6-NEXT:    s_endpgm
3809;
3810; GFX7-LABEL: global_agent_one_as_acquire_load:
3811; GFX7:       ; %bb.0: ; %entry
3812; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3813; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3814; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3815; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3816; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3817; GFX7-NEXT:    s_waitcnt vmcnt(0)
3818; GFX7-NEXT:    buffer_wbinvl1_vol
3819; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3820; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3821; GFX7-NEXT:    flat_store_dword v[2:3], v0
3822; GFX7-NEXT:    s_endpgm
3823;
3824; GFX10-WGP-LABEL: global_agent_one_as_acquire_load:
3825; GFX10-WGP:       ; %bb.0: ; %entry
3826; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3827; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3828; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3829; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3830; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3831; GFX10-WGP-NEXT:    buffer_gl0_inv
3832; GFX10-WGP-NEXT:    buffer_gl1_inv
3833; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3834; GFX10-WGP-NEXT:    s_endpgm
3835;
3836; GFX10-CU-LABEL: global_agent_one_as_acquire_load:
3837; GFX10-CU:       ; %bb.0: ; %entry
3838; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3839; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3840; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3841; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3842; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3843; GFX10-CU-NEXT:    buffer_gl0_inv
3844; GFX10-CU-NEXT:    buffer_gl1_inv
3845; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3846; GFX10-CU-NEXT:    s_endpgm
3847;
3848; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load:
3849; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3850; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3852; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3853; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3854; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3855; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3856; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3857; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3858; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3859; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3860; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3861; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3862; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3863; SKIP-CACHE-INV-NEXT:    s_endpgm
3864;
3865; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load:
3866; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3867; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3868; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3870; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3871; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3872; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3873; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3874; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3875;
3876; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load:
3877; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3878; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3879; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3880; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3881; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3882; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3883; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3884; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3885; GFX90A-TGSPLIT-NEXT:    s_endpgm
3886    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3887entry:
3888  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4
  ; Non-atomic store of the loaded value; it keeps the load's result live.
3889  store i32 %val, i32 addrspace(1)* %out
3890  ret void
3891}
3892
; Atomic i32 load from global memory at syncscope("agent-one-as") with
; seq_cst ordering; the loaded value is then stored through %out.
; Compared with the acquire variant's expectations, every configuration
; additionally expects a wait placed in front of the load: `s_waitcnt
; vmcnt(0)` on gfx600, gfx700 and the -amdgcn-skip-cache-invalidations run,
; `s_waitcnt vmcnt(0) lgkmcnt(0)` on gfx90a, and that same wait plus
; `s_waitcnt_vscnt null, 0x0` on gfx1010. After the glc load (glc and dlc on
; gfx1010) comes `s_waitcnt vmcnt(0)` and then a cache invalidate, except in
; the -amdgcn-skip-cache-invalidations run, which expects no invalidate.
3893define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
3894; GFX6-LABEL: global_agent_one_as_seq_cst_load:
3895; GFX6:       ; %bb.0: ; %entry
3896; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3897; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3898; GFX6-NEXT:    s_mov_b32 s2, -1
3899; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3900; GFX6-NEXT:    s_mov_b32 s0, s4
3901; GFX6-NEXT:    s_mov_b32 s1, s5
3902; GFX6-NEXT:    s_waitcnt vmcnt(0)
3903; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3904; GFX6-NEXT:    s_waitcnt vmcnt(0)
3905; GFX6-NEXT:    buffer_wbinvl1
3906; GFX6-NEXT:    s_mov_b32 s4, s6
3907; GFX6-NEXT:    s_mov_b32 s5, s7
3908; GFX6-NEXT:    s_mov_b32 s6, s2
3909; GFX6-NEXT:    s_mov_b32 s7, s3
3910; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3911; GFX6-NEXT:    s_endpgm
3912;
3913; GFX7-LABEL: global_agent_one_as_seq_cst_load:
3914; GFX7:       ; %bb.0: ; %entry
3915; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3917; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3918; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3919; GFX7-NEXT:    s_waitcnt vmcnt(0)
3920; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3921; GFX7-NEXT:    s_waitcnt vmcnt(0)
3922; GFX7-NEXT:    buffer_wbinvl1_vol
3923; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3924; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3925; GFX7-NEXT:    flat_store_dword v[2:3], v0
3926; GFX7-NEXT:    s_endpgm
3927;
3928; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load:
3929; GFX10-WGP:       ; %bb.0: ; %entry
3930; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3931; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3932; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3933; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3934; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3935; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3936; GFX10-WGP-NEXT:    buffer_gl0_inv
3937; GFX10-WGP-NEXT:    buffer_gl1_inv
3938; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3939; GFX10-WGP-NEXT:    s_endpgm
3940;
3941; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load:
3942; GFX10-CU:       ; %bb.0: ; %entry
3943; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3944; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3945; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3946; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3947; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3948; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3949; GFX10-CU-NEXT:    buffer_gl0_inv
3950; GFX10-CU-NEXT:    buffer_gl1_inv
3951; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3952; GFX10-CU-NEXT:    s_endpgm
3953;
3954; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load:
3955; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3956; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3957; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3958; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3959; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3960; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3961; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3962; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3963; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3964; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3965; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3966; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3967; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3968; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3969; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3970; SKIP-CACHE-INV-NEXT:    s_endpgm
3971;
3972; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
3973; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3974; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3975; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3976; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3977; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3978; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3979; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3980; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3981; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3982;
3983; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
3984; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3985; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3986; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3987; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3988; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3989; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3990; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3991; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3992; GFX90A-TGSPLIT-NEXT:    s_endpgm
3993    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3994entry:
3995  %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4
  ; Non-atomic store of the loaded value; it keeps the load's result live.
3996  store i32 %val, i32 addrspace(1)* %out
3997  ret void
3998}
3999
; Atomic i32 store of %in to global memory through %out at
; syncscope("agent-one-as") with unordered ordering.
; Every configuration expects an ordinary store instruction with no
; cache-policy bits, no vector-memory wait and no cache invalidate; the only
; wait expected is the `s_waitcnt lgkmcnt(0)` that follows the scalar loads.
4000define amdgpu_kernel void @global_agent_one_as_unordered_store(
4001; GFX6-LABEL: global_agent_one_as_unordered_store:
4002; GFX6:       ; %bb.0: ; %entry
4003; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4004; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4005; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4006; GFX6-NEXT:    s_mov_b32 s2, -1
4007; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4008; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4009; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4010; GFX6-NEXT:    s_endpgm
4011;
4012; GFX7-LABEL: global_agent_one_as_unordered_store:
4013; GFX7:       ; %bb.0: ; %entry
4014; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4015; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4016; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4017; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4018; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4019; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4020; GFX7-NEXT:    flat_store_dword v[0:1], v2
4021; GFX7-NEXT:    s_endpgm
4022;
4023; GFX10-WGP-LABEL: global_agent_one_as_unordered_store:
4024; GFX10-WGP:       ; %bb.0: ; %entry
4025; GFX10-WGP-NEXT:    s_clause 0x1
4026; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4027; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4028; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4029; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4030; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4031; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4032; GFX10-WGP-NEXT:    s_endpgm
4033;
4034; GFX10-CU-LABEL: global_agent_one_as_unordered_store:
4035; GFX10-CU:       ; %bb.0: ; %entry
4036; GFX10-CU-NEXT:    s_clause 0x1
4037; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4038; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4039; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4040; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4041; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4042; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4043; GFX10-CU-NEXT:    s_endpgm
4044;
4045; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store:
4046; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4047; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4048; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4049; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4050; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4051; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4053; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4054; SKIP-CACHE-INV-NEXT:    s_endpgm
4055;
4056; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store:
4057; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4058; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4059; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4060; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4061; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4063; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4064; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4065;
4066; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store:
4067; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4068; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4069; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4070; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4071; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4072; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4073; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4074; GFX90A-TGSPLIT-NEXT:    s_endpgm
4075    i32 %in, i32 addrspace(1)* %out) {
4076entry:
4077  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4
4078  ret void
4079}
4080
; Atomic i32 store of %in to %out (addrspace(1)) with monotonic ordering at
; syncscope "agent-one-as".
; In the expected output of every run line the store is emitted right after the
; kernel arguments are available: the only wait is the s_waitcnt lgkmcnt(0)
; that follows the s_load instructions, and no cache-maintenance instruction
; appears before s_endpgm.
4081define amdgpu_kernel void @global_agent_one_as_monotonic_store(
4082; GFX6-LABEL: global_agent_one_as_monotonic_store:
4083; GFX6:       ; %bb.0: ; %entry
4084; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4085; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4086; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4087; GFX6-NEXT:    s_mov_b32 s2, -1
4088; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4089; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4090; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4091; GFX6-NEXT:    s_endpgm
4092;
4093; GFX7-LABEL: global_agent_one_as_monotonic_store:
4094; GFX7:       ; %bb.0: ; %entry
4095; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4096; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4097; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4098; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4099; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4100; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4101; GFX7-NEXT:    flat_store_dword v[0:1], v2
4102; GFX7-NEXT:    s_endpgm
4103;
4104; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store:
4105; GFX10-WGP:       ; %bb.0: ; %entry
4106; GFX10-WGP-NEXT:    s_clause 0x1
4107; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4108; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4109; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4110; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4111; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4112; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4113; GFX10-WGP-NEXT:    s_endpgm
4114;
4115; GFX10-CU-LABEL: global_agent_one_as_monotonic_store:
4116; GFX10-CU:       ; %bb.0: ; %entry
4117; GFX10-CU-NEXT:    s_clause 0x1
4118; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4119; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4120; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4121; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4122; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4123; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4124; GFX10-CU-NEXT:    s_endpgm
4125;
4126; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store:
4127; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4128; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4129; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4130; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4131; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4132; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4133; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4134; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4135; SKIP-CACHE-INV-NEXT:    s_endpgm
4136;
4137; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store:
4138; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4139; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4140; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4141; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4142; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4143; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4144; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4145; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4146;
4147; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store:
4148; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4149; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4150; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4151; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4152; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4153; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4154; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4155; GFX90A-TGSPLIT-NEXT:    s_endpgm
4156    i32 %in, i32 addrspace(1)* %out) {
4157entry:
4158  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4
4159  ret void
4160}
4161
; Atomic i32 store of %in to %out (addrspace(1)) with release ordering at
; syncscope "agent-one-as".
; Every run line expects an s_waitcnt vmcnt(0) directly ahead of the store.
; The two gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0 between
; that wait and the store. No cache-maintenance instruction is expected.
4162define amdgpu_kernel void @global_agent_one_as_release_store(
4163; GFX6-LABEL: global_agent_one_as_release_store:
4164; GFX6:       ; %bb.0: ; %entry
4165; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4166; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4167; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4168; GFX6-NEXT:    s_mov_b32 s2, -1
4169; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4170; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4171; GFX6-NEXT:    s_waitcnt vmcnt(0)
4172; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4173; GFX6-NEXT:    s_endpgm
4174;
4175; GFX7-LABEL: global_agent_one_as_release_store:
4176; GFX7:       ; %bb.0: ; %entry
4177; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4178; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4179; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4180; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4181; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4182; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4183; GFX7-NEXT:    s_waitcnt vmcnt(0)
4184; GFX7-NEXT:    flat_store_dword v[0:1], v2
4185; GFX7-NEXT:    s_endpgm
4186;
4187; GFX10-WGP-LABEL: global_agent_one_as_release_store:
4188; GFX10-WGP:       ; %bb.0: ; %entry
4189; GFX10-WGP-NEXT:    s_clause 0x1
4190; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4191; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4192; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4193; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4194; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4195; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4196; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4197; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4198; GFX10-WGP-NEXT:    s_endpgm
4199;
4200; GFX10-CU-LABEL: global_agent_one_as_release_store:
4201; GFX10-CU:       ; %bb.0: ; %entry
4202; GFX10-CU-NEXT:    s_clause 0x1
4203; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4204; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4205; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4206; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4207; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4208; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4209; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4210; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4211; GFX10-CU-NEXT:    s_endpgm
4212;
4213; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store:
4214; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4215; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4216; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4217; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4218; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4219; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4220; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4221; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4222; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4223; SKIP-CACHE-INV-NEXT:    s_endpgm
4224;
4225; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store:
4226; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4227; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4228; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4229; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4230; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4231; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4232; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4233; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4234; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4235;
4236; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store:
4237; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4238; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4239; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4240; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4241; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4242; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4243; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4244; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4245; GFX90A-TGSPLIT-NEXT:    s_endpgm
4246    i32 %in, i32 addrspace(1)* %out) {
4247entry:
4248  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4
4249  ret void
4250}
4251
; Atomic i32 store of %in to %out (addrspace(1)) with seq_cst ordering at
; syncscope "agent-one-as".
; Every run line expects an s_waitcnt vmcnt(0) directly ahead of the store.
; The two gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0 between
; that wait and the store. Nothing is expected between the store and s_endpgm.
4252define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
4253; GFX6-LABEL: global_agent_one_as_seq_cst_store:
4254; GFX6:       ; %bb.0: ; %entry
4255; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4256; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4257; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4258; GFX6-NEXT:    s_mov_b32 s2, -1
4259; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4260; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4261; GFX6-NEXT:    s_waitcnt vmcnt(0)
4262; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4263; GFX6-NEXT:    s_endpgm
4264;
4265; GFX7-LABEL: global_agent_one_as_seq_cst_store:
4266; GFX7:       ; %bb.0: ; %entry
4267; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4268; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4269; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4270; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4271; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4272; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4273; GFX7-NEXT:    s_waitcnt vmcnt(0)
4274; GFX7-NEXT:    flat_store_dword v[0:1], v2
4275; GFX7-NEXT:    s_endpgm
4276;
4277; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store:
4278; GFX10-WGP:       ; %bb.0: ; %entry
4279; GFX10-WGP-NEXT:    s_clause 0x1
4280; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4281; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4282; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4283; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4284; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4285; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4286; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4287; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4288; GFX10-WGP-NEXT:    s_endpgm
4289;
4290; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store:
4291; GFX10-CU:       ; %bb.0: ; %entry
4292; GFX10-CU-NEXT:    s_clause 0x1
4293; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4294; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4296; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4297; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4298; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4299; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4300; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4301; GFX10-CU-NEXT:    s_endpgm
4302;
4303; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store:
4304; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4305; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4306; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4307; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4309; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4310; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4311; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4312; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4313; SKIP-CACHE-INV-NEXT:    s_endpgm
4314;
4315; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
4316; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4317; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4318; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4319; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4320; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4321; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4322; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4323; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4324; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4325;
4326; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
4327; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4328; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4329; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4330; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4331; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4332; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4333; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4334; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4335; GFX90A-TGSPLIT-NEXT:    s_endpgm
4336    i32 %in, i32 addrspace(1)* %out) {
4337entry:
4338  store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4
4339  ret void
4340}
4341
; Volatile atomicrmw xchg of %in into %out (addrspace(1)) with monotonic
; ordering at syncscope "agent-one-as".
; Every run line expects a single swap instruction written without the glc
; modifier, preceded only by the s_waitcnt lgkmcnt(0) that follows the s_load
; instructions and followed directly by s_endpgm.
4342define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
4343; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw:
4344; GFX6:       ; %bb.0: ; %entry
4345; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4346; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4347; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4348; GFX6-NEXT:    s_mov_b32 s2, -1
4349; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4350; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4351; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4352; GFX6-NEXT:    s_endpgm
4353;
4354; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
4355; GFX7:       ; %bb.0: ; %entry
4356; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4357; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4359; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4360; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4361; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4362; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4363; GFX7-NEXT:    s_endpgm
4364;
4365; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw:
4366; GFX10-WGP:       ; %bb.0: ; %entry
4367; GFX10-WGP-NEXT:    s_clause 0x1
4368; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4369; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4370; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4371; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4372; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4373; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4374; GFX10-WGP-NEXT:    s_endpgm
4375;
4376; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
4377; GFX10-CU:       ; %bb.0: ; %entry
4378; GFX10-CU-NEXT:    s_clause 0x1
4379; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4380; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4381; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4382; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4383; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4384; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4385; GFX10-CU-NEXT:    s_endpgm
4386;
4387; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw:
4388; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4389; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4390; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4391; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4392; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4393; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4394; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4395; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4396; SKIP-CACHE-INV-NEXT:    s_endpgm
4397;
4398; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
4399; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4400; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4401; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4402; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4403; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4404; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4405; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4406; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4407;
4408; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
4409; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4410; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4411; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4412; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4413; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4414; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4415; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4416; GFX90A-TGSPLIT-NEXT:    s_endpgm
4417    i32 addrspace(1)* %out, i32 %in) {
4418entry:
  ; %val has no users in this function.
4419  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic
4420  ret void
4421}
4422
; Volatile atomicrmw xchg of %in into %out (addrspace(1)) with acquire
; ordering at syncscope "agent-one-as".
; Expected shape after the swap instruction, per run line:
;   - gfx600: s_waitcnt vmcnt(0), then buffer_wbinvl1.
;   - gfx700 (amdhsa) and both gfx90a runs: s_waitcnt vmcnt(0), then
;     buffer_wbinvl1_vol.
;   - both gfx1010 runs: s_waitcnt_vscnt null, 0x0, then buffer_gl0_inv and
;     buffer_gl1_inv.
;   - the run using -amdgcn-skip-cache-invalidations: s_waitcnt vmcnt(0) only,
;     with no invalidate instruction.
; No extra wait is expected ahead of the swap.
4423define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
4424; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw:
4425; GFX6:       ; %bb.0: ; %entry
4426; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4427; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4428; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4429; GFX6-NEXT:    s_mov_b32 s2, -1
4430; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4431; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4432; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4433; GFX6-NEXT:    s_waitcnt vmcnt(0)
4434; GFX6-NEXT:    buffer_wbinvl1
4435; GFX6-NEXT:    s_endpgm
4436;
4437; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
4438; GFX7:       ; %bb.0: ; %entry
4439; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4440; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4441; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4442; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4443; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4444; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4445; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4446; GFX7-NEXT:    s_waitcnt vmcnt(0)
4447; GFX7-NEXT:    buffer_wbinvl1_vol
4448; GFX7-NEXT:    s_endpgm
4449;
4450; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw:
4451; GFX10-WGP:       ; %bb.0: ; %entry
4452; GFX10-WGP-NEXT:    s_clause 0x1
4453; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4454; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4455; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4456; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4457; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4458; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4459; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4460; GFX10-WGP-NEXT:    buffer_gl0_inv
4461; GFX10-WGP-NEXT:    buffer_gl1_inv
4462; GFX10-WGP-NEXT:    s_endpgm
4463;
4464; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
4465; GFX10-CU:       ; %bb.0: ; %entry
4466; GFX10-CU-NEXT:    s_clause 0x1
4467; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4468; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4469; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4470; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4471; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4472; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4473; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4474; GFX10-CU-NEXT:    buffer_gl0_inv
4475; GFX10-CU-NEXT:    buffer_gl1_inv
4476; GFX10-CU-NEXT:    s_endpgm
4477;
4478; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw:
4479; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4480; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4481; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4484; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4486; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4487; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4488; SKIP-CACHE-INV-NEXT:    s_endpgm
4489;
4490; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
4491; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4492; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4493; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4494; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4495; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4496; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4497; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4498; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4499; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4500; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4501;
4502; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
4503; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4504; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4505; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4506; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4507; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4508; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4509; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4510; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4511; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4512; GFX90A-TGSPLIT-NEXT:    s_endpgm
4513    i32 addrspace(1)* %out, i32 %in) {
4514entry:
  ; %val has no users in this function.
4515  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
4516  ret void
4517}
4518
; Volatile atomicrmw xchg of %in into %out (addrspace(1)) with release
; ordering at syncscope "agent-one-as".
; Every run line expects an s_waitcnt vmcnt(0) directly ahead of the swap.
; The two gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0 between
; that wait and the swap. Nothing is expected between the swap and s_endpgm.
4519define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
4520; GFX6-LABEL: global_agent_one_as_release_atomicrmw:
4521; GFX6:       ; %bb.0: ; %entry
4522; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4523; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4524; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4525; GFX6-NEXT:    s_mov_b32 s2, -1
4526; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4527; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4528; GFX6-NEXT:    s_waitcnt vmcnt(0)
4529; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4530; GFX6-NEXT:    s_endpgm
4531;
4532; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
4533; GFX7:       ; %bb.0: ; %entry
4534; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4535; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4536; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4537; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4538; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4539; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4540; GFX7-NEXT:    s_waitcnt vmcnt(0)
4541; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4542; GFX7-NEXT:    s_endpgm
4543;
4544; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw:
4545; GFX10-WGP:       ; %bb.0: ; %entry
4546; GFX10-WGP-NEXT:    s_clause 0x1
4547; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4548; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4549; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4550; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4551; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4552; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4553; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4554; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4555; GFX10-WGP-NEXT:    s_endpgm
4556;
4557; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw:
4558; GFX10-CU:       ; %bb.0: ; %entry
4559; GFX10-CU-NEXT:    s_clause 0x1
4560; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4561; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4562; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4563; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4564; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4565; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4566; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4567; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4568; GFX10-CU-NEXT:    s_endpgm
4569;
4570; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw:
4571; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4572; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4573; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4574; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4575; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4576; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4577; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4578; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4579; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4580; SKIP-CACHE-INV-NEXT:    s_endpgm
4581;
4582; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
4583; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4584; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4585; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4586; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4587; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4588; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4589; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4590; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4591; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4592;
4593; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
4594; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4595; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4596; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4597; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4598; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4599; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4600; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4601; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4602; GFX90A-TGSPLIT-NEXT:    s_endpgm
4603    i32 addrspace(1)* %out, i32 %in) {
4604entry:
  ; %val has no users in this function.
4605  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release
4606  ret void
4607}
4608
; Volatile atomicrmw xchg of %in into %out (addrspace(1)) with acq_rel
; ordering at syncscope "agent-one-as".
; The checks expect waits on both sides of the swap:
;   - before the swap: s_waitcnt vmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs.
;   - after the swap: s_waitcnt vmcnt(0) (s_waitcnt_vscnt null, 0x0 on
;     gfx1010), then a cache invalidate: buffer_wbinvl1 on gfx600,
;     buffer_wbinvl1_vol on gfx700 (amdhsa) and gfx90a, buffer_gl0_inv plus
;     buffer_gl1_inv on gfx1010.
;   - the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;     no invalidate instruction.
4609define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
4610; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4611; GFX6:       ; %bb.0: ; %entry
4612; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4613; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4614; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4615; GFX6-NEXT:    s_mov_b32 s2, -1
4616; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4617; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4618; GFX6-NEXT:    s_waitcnt vmcnt(0)
4619; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4620; GFX6-NEXT:    s_waitcnt vmcnt(0)
4621; GFX6-NEXT:    buffer_wbinvl1
4622; GFX6-NEXT:    s_endpgm
4623;
4624; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4625; GFX7:       ; %bb.0: ; %entry
4626; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4627; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4628; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4629; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4630; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4631; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4632; GFX7-NEXT:    s_waitcnt vmcnt(0)
4633; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4634; GFX7-NEXT:    s_waitcnt vmcnt(0)
4635; GFX7-NEXT:    buffer_wbinvl1_vol
4636; GFX7-NEXT:    s_endpgm
4637;
4638; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4639; GFX10-WGP:       ; %bb.0: ; %entry
4640; GFX10-WGP-NEXT:    s_clause 0x1
4641; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4642; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4643; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4644; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4645; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4646; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4647; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4648; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4649; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4650; GFX10-WGP-NEXT:    buffer_gl0_inv
4651; GFX10-WGP-NEXT:    buffer_gl1_inv
4652; GFX10-WGP-NEXT:    s_endpgm
4653;
4654; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4655; GFX10-CU:       ; %bb.0: ; %entry
4656; GFX10-CU-NEXT:    s_clause 0x1
4657; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4658; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4659; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4660; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4661; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4662; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4663; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4664; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4665; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4666; GFX10-CU-NEXT:    buffer_gl0_inv
4667; GFX10-CU-NEXT:    buffer_gl1_inv
4668; GFX10-CU-NEXT:    s_endpgm
4669;
4670; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4671; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4672; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4673; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4674; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4675; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4676; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4677; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4678; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4679; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4680; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4681; SKIP-CACHE-INV-NEXT:    s_endpgm
4682;
4683; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4684; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4685; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4686; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4687; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4688; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4689; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4690; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4691; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4692; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4693; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4694; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4695;
4696; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
4697; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4698; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4699; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4700; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4701; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4702; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4703; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4704; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4705; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4706; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4707; GFX90A-TGSPLIT-NEXT:    s_endpgm
4708    i32 addrspace(1)* %out, i32 %in) {
4709entry:
  ; %val has no users in this function.
4710  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
4711  ret void
4712}
4713
; Volatile atomicrmw xchg of %in into %out (addrspace(1)) with seq_cst
; ordering at syncscope "agent-one-as".
; The checks expect waits on both sides of the swap:
;   - before the swap: s_waitcnt vmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on the two gfx1010 runs.
;   - after the swap: s_waitcnt vmcnt(0) (s_waitcnt_vscnt null, 0x0 on
;     gfx1010), then a cache invalidate: buffer_wbinvl1 on gfx600,
;     buffer_wbinvl1_vol on gfx700 (amdhsa) and gfx90a, buffer_gl0_inv plus
;     buffer_gl1_inv on gfx1010.
;   - the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;     no invalidate instruction.
4714define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
4715; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4716; GFX6:       ; %bb.0: ; %entry
4717; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4718; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4719; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4720; GFX6-NEXT:    s_mov_b32 s2, -1
4721; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4722; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4723; GFX6-NEXT:    s_waitcnt vmcnt(0)
4724; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4725; GFX6-NEXT:    s_waitcnt vmcnt(0)
4726; GFX6-NEXT:    buffer_wbinvl1
4727; GFX6-NEXT:    s_endpgm
4728;
4729; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4730; GFX7:       ; %bb.0: ; %entry
4731; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4732; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4733; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4734; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4735; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4736; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4737; GFX7-NEXT:    s_waitcnt vmcnt(0)
4738; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4739; GFX7-NEXT:    s_waitcnt vmcnt(0)
4740; GFX7-NEXT:    buffer_wbinvl1_vol
4741; GFX7-NEXT:    s_endpgm
4742;
4743; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4744; GFX10-WGP:       ; %bb.0: ; %entry
4745; GFX10-WGP-NEXT:    s_clause 0x1
4746; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4747; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4748; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4749; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4751; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4752; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4753; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4754; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4755; GFX10-WGP-NEXT:    buffer_gl0_inv
4756; GFX10-WGP-NEXT:    buffer_gl1_inv
4757; GFX10-WGP-NEXT:    s_endpgm
4758;
4759; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4760; GFX10-CU:       ; %bb.0: ; %entry
4761; GFX10-CU-NEXT:    s_clause 0x1
4762; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4763; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4764; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4765; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4766; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4767; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4768; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4769; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4770; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4771; GFX10-CU-NEXT:    buffer_gl0_inv
4772; GFX10-CU-NEXT:    buffer_gl1_inv
4773; GFX10-CU-NEXT:    s_endpgm
4774;
4775; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4776; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4777; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4778; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4779; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4780; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4781; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4783; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4784; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4785; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4786; SKIP-CACHE-INV-NEXT:    s_endpgm
4787;
4788; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4789; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4790; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4791; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4792; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4793; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4794; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4795; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4796; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4797; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4798; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4799; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4800;
4801; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
4802; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4803; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4804; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4805; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4806; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4807; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4808; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4809; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4810; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4811; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4812; GFX90A-TGSPLIT-NEXT:    s_endpgm
4813    i32 addrspace(1)* %out, i32 %in) {
4814entry:
  ; %val has no users in this function.
4815  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
4816  ret void
4817}
4818
; Returning atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("agent-one-as") and acquire ordering. The previous memory value
; is stored back through %out, so the result of the atomic is used.
; In the expected output below the atomic carries glc and is followed by
; s_waitcnt vmcnt(0) and then a cache invalidate (buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv, depending on the
; target). The -amdgcn-skip-cache-invalidations configuration keeps the wait
; but has no invalidate instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
4819define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
4820; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4821; GFX6:       ; %bb.0: ; %entry
4822; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4823; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4824; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4825; GFX6-NEXT:    s_mov_b32 s2, -1
4826; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4827; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4828; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
4829; GFX6-NEXT:    s_waitcnt vmcnt(0)
4830; GFX6-NEXT:    buffer_wbinvl1
4831; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4832; GFX6-NEXT:    s_endpgm
4833;
4834; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4835; GFX7:       ; %bb.0: ; %entry
4836; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4837; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4838; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4839; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4840; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4841; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4842; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4843; GFX7-NEXT:    s_waitcnt vmcnt(0)
4844; GFX7-NEXT:    buffer_wbinvl1_vol
4845; GFX7-NEXT:    flat_store_dword v[0:1], v2
4846; GFX7-NEXT:    s_endpgm
4847;
4848; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4849; GFX10-WGP:       ; %bb.0: ; %entry
4850; GFX10-WGP-NEXT:    s_clause 0x1
4851; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4852; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4853; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4854; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4855; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4856; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4857; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4858; GFX10-WGP-NEXT:    buffer_gl0_inv
4859; GFX10-WGP-NEXT:    buffer_gl1_inv
4860; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4861; GFX10-WGP-NEXT:    s_endpgm
4862;
4863; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4864; GFX10-CU:       ; %bb.0: ; %entry
4865; GFX10-CU-NEXT:    s_clause 0x1
4866; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4867; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4868; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4869; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4870; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4871; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4872; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4873; GFX10-CU-NEXT:    buffer_gl0_inv
4874; GFX10-CU-NEXT:    buffer_gl1_inv
4875; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4876; GFX10-CU-NEXT:    s_endpgm
4877;
4878; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4879; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4880; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4881; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4882; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4883; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4884; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4885; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4886; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
4887; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4888; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4889; SKIP-CACHE-INV-NEXT:    s_endpgm
4890;
4891; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4892; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4893; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4894; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4895; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4896; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4897; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4898; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4899; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4900; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4901; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4902; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4903;
4904; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
4905; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4906; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4907; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4908; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4909; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4910; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4911; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4912; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4913; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4914; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4915; GFX90A-TGSPLIT-NEXT:    s_endpgm
4916    i32 addrspace(1)* %out, i32 %in) {
4917entry:
  ; Exchange %in into *%out; %val receives the value that was there before.
4918  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
  ; Using %val keeps the result of the atomic live.
4919  store i32 %val, i32 addrspace(1)* %out, align 4
4920  ret void
4921}
4922
; Returning atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("agent-one-as") and acq_rel ordering. The previous memory value
; is stored back through %out, so the result of the atomic is used.
; In the expected output below the atomic is bracketed on both sides:
;   - before it: s_waitcnt vmcnt(0), plus s_waitcnt_vscnt null, 0x0 on the
;     gfx1010 configurations;
;   - after it: s_waitcnt vmcnt(0) and a cache invalidate (buffer_wbinvl1,
;     buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv, depending on
;     the target).
; The -amdgcn-skip-cache-invalidations configuration keeps both waits but has
; no invalidate instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
4923define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
4924; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
4925; GFX6:       ; %bb.0: ; %entry
4926; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4927; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4928; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4929; GFX6-NEXT:    s_mov_b32 s2, -1
4930; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4931; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4932; GFX6-NEXT:    s_waitcnt vmcnt(0)
4933; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
4934; GFX6-NEXT:    s_waitcnt vmcnt(0)
4935; GFX6-NEXT:    buffer_wbinvl1
4936; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4937; GFX6-NEXT:    s_endpgm
4938;
4939; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
4940; GFX7:       ; %bb.0: ; %entry
4941; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4942; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4943; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4944; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4945; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4946; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4947; GFX7-NEXT:    s_waitcnt vmcnt(0)
4948; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4949; GFX7-NEXT:    s_waitcnt vmcnt(0)
4950; GFX7-NEXT:    buffer_wbinvl1_vol
4951; GFX7-NEXT:    flat_store_dword v[0:1], v2
4952; GFX7-NEXT:    s_endpgm
4953;
4954; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
4955; GFX10-WGP:       ; %bb.0: ; %entry
4956; GFX10-WGP-NEXT:    s_clause 0x1
4957; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4958; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4959; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4960; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4961; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4962; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4963; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4964; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4965; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4966; GFX10-WGP-NEXT:    buffer_gl0_inv
4967; GFX10-WGP-NEXT:    buffer_gl1_inv
4968; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4969; GFX10-WGP-NEXT:    s_endpgm
4970;
4971; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
4972; GFX10-CU:       ; %bb.0: ; %entry
4973; GFX10-CU-NEXT:    s_clause 0x1
4974; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4975; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4976; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4977; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4978; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4979; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4980; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4981; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4982; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4983; GFX10-CU-NEXT:    buffer_gl0_inv
4984; GFX10-CU-NEXT:    buffer_gl1_inv
4985; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4986; GFX10-CU-NEXT:    s_endpgm
4987;
4988; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
4989; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4990; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4991; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4992; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4993; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4994; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4995; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4996; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4997; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
4998; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4999; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5000; SKIP-CACHE-INV-NEXT:    s_endpgm
5001;
5002; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
5003; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5004; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5005; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5006; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5007; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5008; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5009; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5010; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5011; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5012; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5013; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5014; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5015;
5016; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
5017; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5018; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5019; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5020; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5021; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5022; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5023; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5024; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5025; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5026; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5027; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5028; GFX90A-TGSPLIT-NEXT:    s_endpgm
5029    i32 addrspace(1)* %out, i32 %in) {
5030entry:
  ; Exchange %in into *%out; %val receives the value that was there before.
5031  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
  ; Using %val keeps the result of the atomic live.
5032  store i32 %val, i32 addrspace(1)* %out, align 4
5033  ret void
5034}
5035
; Returning atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("agent-one-as") and seq_cst ordering. The previous memory value
; is stored back through %out, so the result of the atomic is used.
; In the expected output below the atomic is bracketed on both sides:
;   - before it: s_waitcnt vmcnt(0), plus s_waitcnt_vscnt null, 0x0 on the
;     gfx1010 configurations;
;   - after it: s_waitcnt vmcnt(0) and a cache invalidate (buffer_wbinvl1,
;     buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv, depending on
;     the target).
; The -amdgcn-skip-cache-invalidations configuration keeps both waits but has
; no invalidate instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
5036define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
5037; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5038; GFX6:       ; %bb.0: ; %entry
5039; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5040; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
5041; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5042; GFX6-NEXT:    s_mov_b32 s2, -1
5043; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5044; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5045; GFX6-NEXT:    s_waitcnt vmcnt(0)
5046; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
5047; GFX6-NEXT:    s_waitcnt vmcnt(0)
5048; GFX6-NEXT:    buffer_wbinvl1
5049; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5050; GFX6-NEXT:    s_endpgm
5051;
5052; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5053; GFX7:       ; %bb.0: ; %entry
5054; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5055; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5056; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5057; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5058; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5059; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5060; GFX7-NEXT:    s_waitcnt vmcnt(0)
5061; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5062; GFX7-NEXT:    s_waitcnt vmcnt(0)
5063; GFX7-NEXT:    buffer_wbinvl1_vol
5064; GFX7-NEXT:    flat_store_dword v[0:1], v2
5065; GFX7-NEXT:    s_endpgm
5066;
5067; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5068; GFX10-WGP:       ; %bb.0: ; %entry
5069; GFX10-WGP-NEXT:    s_clause 0x1
5070; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5071; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5072; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5073; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5074; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5075; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5076; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5077; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5078; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5079; GFX10-WGP-NEXT:    buffer_gl0_inv
5080; GFX10-WGP-NEXT:    buffer_gl1_inv
5081; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
5082; GFX10-WGP-NEXT:    s_endpgm
5083;
5084; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5085; GFX10-CU:       ; %bb.0: ; %entry
5086; GFX10-CU-NEXT:    s_clause 0x1
5087; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5088; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5089; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5090; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5091; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5092; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5093; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5094; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5095; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5096; GFX10-CU-NEXT:    buffer_gl0_inv
5097; GFX10-CU-NEXT:    buffer_gl1_inv
5098; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
5099; GFX10-CU-NEXT:    s_endpgm
5100;
5101; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5102; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5103; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5104; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5105; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5106; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5107; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5108; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5109; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5110; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
5111; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5112; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5113; SKIP-CACHE-INV-NEXT:    s_endpgm
5114;
5115; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5116; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5117; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5118; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5119; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5120; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5121; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5122; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5123; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5124; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5125; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5126; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5127; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5128;
5129; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
5130; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5131; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5132; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5133; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5134; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5135; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5136; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5137; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5138; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5139; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5140; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5141; GFX90A-TGSPLIT-NEXT:    s_endpgm
5142    i32 addrspace(1)* %out, i32 %in) {
5143entry:
  ; Exchange %in into *%out; %val receives the value that was there before.
5144  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
  ; Using %val keeps the result of the atomic live.
5145  store i32 %val, i32 addrspace(1)* %out, align 4
5146  ret void
5147}
5148
; cmpxchg on a global (addrspace(1)) pointer with syncscope("agent-one-as"),
; monotonic success ordering and monotonic failure ordering. The result %val
; is not used, and the expected output uses the cmpswap form without glc.
; In the expected output below no s_waitcnt on vmcnt/vscnt and no cache
; invalidate surrounds the atomic in any configuration; only the
; s_waitcnt lgkmcnt(0) for the kernel-argument loads precedes it.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
5149define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
5150; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5151; GFX6:       ; %bb.0: ; %entry
5152; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5153; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5154; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5155; GFX6-NEXT:    s_mov_b32 s2, -1
5156; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5157; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5158; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5159; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5160; GFX6-NEXT:    s_endpgm
5161;
5162; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5163; GFX7:       ; %bb.0: ; %entry
5164; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5165; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5166; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5167; GFX7-NEXT:    s_add_u32 s0, s0, 16
5168; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5169; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5170; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5171; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5172; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5173; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5174; GFX7-NEXT:    s_endpgm
5175;
5176; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5177; GFX10-WGP:       ; %bb.0: ; %entry
5178; GFX10-WGP-NEXT:    s_clause 0x1
5179; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5180; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5181; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5182; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5183; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5184; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5185; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5186; GFX10-WGP-NEXT:    s_endpgm
5187;
5188; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5189; GFX10-CU:       ; %bb.0: ; %entry
5190; GFX10-CU-NEXT:    s_clause 0x1
5191; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5192; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5193; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5194; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5195; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5196; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5197; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5198; GFX10-CU-NEXT:    s_endpgm
5199;
5200; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5201; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5202; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5203; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5204; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5205; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5206; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5207; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5209; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5210; SKIP-CACHE-INV-NEXT:    s_endpgm
5211;
5212; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5213; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5214; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5215; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5216; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5217; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5218; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5219; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5220; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5221;
5222; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
5223; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5224; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5225; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5226; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5227; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5228; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5229; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5230; GFX90A-TGSPLIT-NEXT:    s_endpgm
5231    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5232entry:
  ; Element 4 of an i32 array is byte offset 16, which is the offset:16 (or
  ; the explicit add of 16 to the address) seen in the expected output.
5233  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare *%gep against %old and, on a match, replace it with %in.
5234  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
5235  ret void
5236}
5237
; cmpxchg on a global (addrspace(1)) pointer with syncscope("agent-one-as"),
; acquire success ordering and monotonic failure ordering. The result %val
; is not used, and the expected output uses the cmpswap form without glc.
; In the expected output below the atomic is followed by a wait and then a
; cache invalidate: s_waitcnt vmcnt(0) with buffer_wbinvl1 or
; buffer_wbinvl1_vol on the gfx600/gfx700/gfx90a configurations, and
; s_waitcnt_vscnt null, 0x0 with buffer_gl0_inv + buffer_gl1_inv on the
; gfx1010 configurations. Nothing is inserted before the atomic. The
; -amdgcn-skip-cache-invalidations configuration keeps the trailing wait but
; has no invalidate instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
5238define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
5239; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5240; GFX6:       ; %bb.0: ; %entry
5241; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5242; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5243; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5244; GFX6-NEXT:    s_mov_b32 s2, -1
5245; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5246; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5247; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5248; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5249; GFX6-NEXT:    s_waitcnt vmcnt(0)
5250; GFX6-NEXT:    buffer_wbinvl1
5251; GFX6-NEXT:    s_endpgm
5252;
5253; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5254; GFX7:       ; %bb.0: ; %entry
5255; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5256; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5257; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5258; GFX7-NEXT:    s_add_u32 s0, s0, 16
5259; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5260; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5261; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5262; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5263; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5264; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5265; GFX7-NEXT:    s_waitcnt vmcnt(0)
5266; GFX7-NEXT:    buffer_wbinvl1_vol
5267; GFX7-NEXT:    s_endpgm
5268;
5269; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5270; GFX10-WGP:       ; %bb.0: ; %entry
5271; GFX10-WGP-NEXT:    s_clause 0x1
5272; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5273; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5274; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5275; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5276; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5277; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5278; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5279; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5280; GFX10-WGP-NEXT:    buffer_gl0_inv
5281; GFX10-WGP-NEXT:    buffer_gl1_inv
5282; GFX10-WGP-NEXT:    s_endpgm
5283;
5284; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5285; GFX10-CU:       ; %bb.0: ; %entry
5286; GFX10-CU-NEXT:    s_clause 0x1
5287; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5288; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5289; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5290; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5291; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5292; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5293; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5294; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5295; GFX10-CU-NEXT:    buffer_gl0_inv
5296; GFX10-CU-NEXT:    buffer_gl1_inv
5297; GFX10-CU-NEXT:    s_endpgm
5298;
5299; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5300; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5301; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5302; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5303; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5304; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5305; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5308; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5309; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5310; SKIP-CACHE-INV-NEXT:    s_endpgm
5311;
5312; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5313; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5314; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5316; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5317; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5318; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5319; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5320; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5321; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5322; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5323;
5324; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
5325; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5326; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5327; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5328; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5329; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5330; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5331; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5332; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5333; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5334; GFX90A-TGSPLIT-NEXT:    s_endpgm
5335    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5336entry:
  ; Element 4 of an i32 array is byte offset 16, which is the offset:16 (or
  ; the explicit add of 16 to the address) seen in the expected output.
5337  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare *%gep against %old and, on a match, replace it with %in.
5338  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
5339  ret void
5340}
5341
; cmpxchg on a global (addrspace(1)) pointer with syncscope("agent-one-as"),
; release success ordering and monotonic failure ordering. The result %val
; is not used, and the expected output uses the cmpswap form without glc.
; In the expected output below the atomic is preceded by s_waitcnt vmcnt(0)
; (plus s_waitcnt_vscnt null, 0x0 on the gfx1010 configurations) and nothing
; follows it: no trailing wait and no cache invalidate in any configuration,
; so the -amdgcn-skip-cache-invalidations output has the same shape as the
; others.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
5342define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
5343; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5344; GFX6:       ; %bb.0: ; %entry
5345; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5346; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5347; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5348; GFX6-NEXT:    s_mov_b32 s2, -1
5349; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5350; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5351; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5352; GFX6-NEXT:    s_waitcnt vmcnt(0)
5353; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5354; GFX6-NEXT:    s_endpgm
5355;
5356; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5357; GFX7:       ; %bb.0: ; %entry
5358; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5359; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5360; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5361; GFX7-NEXT:    s_add_u32 s0, s0, 16
5362; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5363; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5364; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5365; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5366; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5367; GFX7-NEXT:    s_waitcnt vmcnt(0)
5368; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5369; GFX7-NEXT:    s_endpgm
5370;
5371; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5372; GFX10-WGP:       ; %bb.0: ; %entry
5373; GFX10-WGP-NEXT:    s_clause 0x1
5374; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5375; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5376; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5377; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5378; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5379; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5380; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5381; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5382; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5383; GFX10-WGP-NEXT:    s_endpgm
5384;
5385; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5386; GFX10-CU:       ; %bb.0: ; %entry
5387; GFX10-CU-NEXT:    s_clause 0x1
5388; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5389; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5390; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5391; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5392; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5393; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5394; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5395; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5396; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5397; GFX10-CU-NEXT:    s_endpgm
5398;
5399; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5400; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5401; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5402; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5403; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5404; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5405; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5406; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5408; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5409; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5410; SKIP-CACHE-INV-NEXT:    s_endpgm
5411;
5412; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5413; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5414; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5415; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5416; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5417; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5418; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5419; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5420; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5421; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5422;
5423; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
5424; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5425; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5426; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5427; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5428; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5429; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5430; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5431; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5432; GFX90A-TGSPLIT-NEXT:    s_endpgm
5433    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5434entry:
  ; Element 4 of an i32 array is byte offset 16, which is the offset:16 (or
  ; the explicit add of 16 to the address) seen in the expected output.
5435  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare *%gep against %old and, on a match, replace it with %in.
5436  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
5437  ret void
5438}
5439
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acq_rel and failure ordering
; monotonic. The value returned by the cmpxchg is never used.
; Apart from the -amdgcn-skip-cache-invalidations run, each expected sequence
; brackets the atomic with waits and then issues buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv. The gfx1010 runs
; add s_waitcnt_vscnt to the leading s_waitcnt vmcnt(0) and use
; s_waitcnt_vscnt alone as the trailing wait. The skip-cache-invalidations
; run keeps both waits but expects none of those trailing instructions.
5440define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
5441; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5442; GFX6:       ; %bb.0: ; %entry
5443; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5444; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5445; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5446; GFX6-NEXT:    s_mov_b32 s2, -1
5447; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5448; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5449; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5450; GFX6-NEXT:    s_waitcnt vmcnt(0)
5451; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5452; GFX6-NEXT:    s_waitcnt vmcnt(0)
5453; GFX6-NEXT:    buffer_wbinvl1
5454; GFX6-NEXT:    s_endpgm
5455;
5456; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5457; GFX7:       ; %bb.0: ; %entry
5458; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5459; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5460; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5461; GFX7-NEXT:    s_add_u32 s0, s0, 16
5462; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5463; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5464; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5465; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5466; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5467; GFX7-NEXT:    s_waitcnt vmcnt(0)
5468; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5469; GFX7-NEXT:    s_waitcnt vmcnt(0)
5470; GFX7-NEXT:    buffer_wbinvl1_vol
5471; GFX7-NEXT:    s_endpgm
5472;
5473; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5474; GFX10-WGP:       ; %bb.0: ; %entry
5475; GFX10-WGP-NEXT:    s_clause 0x1
5476; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5477; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5478; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5479; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5480; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5481; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5482; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5483; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5484; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5485; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5486; GFX10-WGP-NEXT:    buffer_gl0_inv
5487; GFX10-WGP-NEXT:    buffer_gl1_inv
5488; GFX10-WGP-NEXT:    s_endpgm
5489;
5490; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5491; GFX10-CU:       ; %bb.0: ; %entry
5492; GFX10-CU-NEXT:    s_clause 0x1
5493; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5494; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5495; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5496; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5497; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5498; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5499; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5500; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5501; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5502; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5503; GFX10-CU-NEXT:    buffer_gl0_inv
5504; GFX10-CU-NEXT:    buffer_gl1_inv
5505; GFX10-CU-NEXT:    s_endpgm
5506;
5507; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5508; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5509; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5510; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5511; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5512; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5513; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5516; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5517; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5518; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5519; SKIP-CACHE-INV-NEXT:    s_endpgm
5520;
5521; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5522; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5523; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5524; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5525; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5526; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5527; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5528; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5529; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5530; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5531; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5532; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5533;
5534; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
5535; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5536; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5537; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5538; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5539; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5540; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5541; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5542; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5543; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5544; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5545; GFX90A-TGSPLIT-NEXT:    s_endpgm
5546    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5547entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
5548  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5549  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
5550  ret void
5551}
5552
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering seq_cst and failure ordering
; monotonic. The value returned by the cmpxchg is never used.
; Apart from the -amdgcn-skip-cache-invalidations run, each expected sequence
; brackets the atomic with waits and then issues buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv. The gfx1010 runs
; add s_waitcnt_vscnt to the leading s_waitcnt vmcnt(0) and use
; s_waitcnt_vscnt alone as the trailing wait. The skip-cache-invalidations
; run keeps both waits but expects none of those trailing instructions.
5553define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
5554; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5555; GFX6:       ; %bb.0: ; %entry
5556; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5557; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5558; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5559; GFX6-NEXT:    s_mov_b32 s2, -1
5560; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5561; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5562; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5563; GFX6-NEXT:    s_waitcnt vmcnt(0)
5564; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5565; GFX6-NEXT:    s_waitcnt vmcnt(0)
5566; GFX6-NEXT:    buffer_wbinvl1
5567; GFX6-NEXT:    s_endpgm
5568;
5569; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5570; GFX7:       ; %bb.0: ; %entry
5571; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5572; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5573; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5574; GFX7-NEXT:    s_add_u32 s0, s0, 16
5575; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5576; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5577; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5578; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5579; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5580; GFX7-NEXT:    s_waitcnt vmcnt(0)
5581; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5582; GFX7-NEXT:    s_waitcnt vmcnt(0)
5583; GFX7-NEXT:    buffer_wbinvl1_vol
5584; GFX7-NEXT:    s_endpgm
5585;
5586; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5587; GFX10-WGP:       ; %bb.0: ; %entry
5588; GFX10-WGP-NEXT:    s_clause 0x1
5589; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5590; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5591; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5592; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5593; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5594; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5595; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5596; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5597; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5598; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5599; GFX10-WGP-NEXT:    buffer_gl0_inv
5600; GFX10-WGP-NEXT:    buffer_gl1_inv
5601; GFX10-WGP-NEXT:    s_endpgm
5602;
5603; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5604; GFX10-CU:       ; %bb.0: ; %entry
5605; GFX10-CU-NEXT:    s_clause 0x1
5606; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5607; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5608; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5609; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5610; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5611; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5612; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5613; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5614; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5615; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5616; GFX10-CU-NEXT:    buffer_gl0_inv
5617; GFX10-CU-NEXT:    buffer_gl1_inv
5618; GFX10-CU-NEXT:    s_endpgm
5619;
5620; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5621; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5622; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5623; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5624; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5625; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5626; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5627; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5628; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5629; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5630; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5631; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5632; SKIP-CACHE-INV-NEXT:    s_endpgm
5633;
5634; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5635; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5636; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5637; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5638; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5639; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5640; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5641; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5642; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5643; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5644; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5645; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5646;
5647; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
5648; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5649; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5650; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5651; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5652; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5653; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5654; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5655; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5656; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5657; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5658; GFX90A-TGSPLIT-NEXT:    s_endpgm
5659    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5660entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
5661  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5662  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
5663  ret void
5664}
5665
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acquire and failure ordering
; acquire. The value returned by the cmpxchg is never used.
; No expected sequence has a wait between the lgkmcnt(0) wait for the kernel
; arguments and the atomic itself. After the atomic each run expects one wait
; (s_waitcnt vmcnt(0), or s_waitcnt_vscnt on gfx1010) and then
; buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv.
; The -amdgcn-skip-cache-invalidations run keeps the trailing wait but
; expects none of those trailing instructions.
5666define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
5667; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5668; GFX6:       ; %bb.0: ; %entry
5669; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5670; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5671; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5672; GFX6-NEXT:    s_mov_b32 s2, -1
5673; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5674; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5675; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5676; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5677; GFX6-NEXT:    s_waitcnt vmcnt(0)
5678; GFX6-NEXT:    buffer_wbinvl1
5679; GFX6-NEXT:    s_endpgm
5680;
5681; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5682; GFX7:       ; %bb.0: ; %entry
5683; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5684; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5685; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5686; GFX7-NEXT:    s_add_u32 s0, s0, 16
5687; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5688; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5689; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5690; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5691; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5692; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5693; GFX7-NEXT:    s_waitcnt vmcnt(0)
5694; GFX7-NEXT:    buffer_wbinvl1_vol
5695; GFX7-NEXT:    s_endpgm
5696;
5697; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5698; GFX10-WGP:       ; %bb.0: ; %entry
5699; GFX10-WGP-NEXT:    s_clause 0x1
5700; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5701; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5702; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5703; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5704; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5705; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5706; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5707; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5708; GFX10-WGP-NEXT:    buffer_gl0_inv
5709; GFX10-WGP-NEXT:    buffer_gl1_inv
5710; GFX10-WGP-NEXT:    s_endpgm
5711;
5712; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5713; GFX10-CU:       ; %bb.0: ; %entry
5714; GFX10-CU-NEXT:    s_clause 0x1
5715; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5716; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5717; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5718; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5719; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5720; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5721; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5722; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5723; GFX10-CU-NEXT:    buffer_gl0_inv
5724; GFX10-CU-NEXT:    buffer_gl1_inv
5725; GFX10-CU-NEXT:    s_endpgm
5726;
5727; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5728; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5729; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5730; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5731; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5732; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5733; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5734; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5735; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5736; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5737; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5738; SKIP-CACHE-INV-NEXT:    s_endpgm
5739;
5740; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5741; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5742; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5743; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5744; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5745; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5746; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5747; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5748; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5749; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5750; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5751;
5752; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
5753; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5754; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5755; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5756; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5757; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5758; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5759; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5760; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5761; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5762; GFX90A-TGSPLIT-NEXT:    s_endpgm
5763    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5764entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
5765  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5766  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
5767  ret void
5768}
5769
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering release and failure ordering
; acquire. The value returned by the cmpxchg is never used.
; Apart from the -amdgcn-skip-cache-invalidations run, each expected sequence
; brackets the atomic with waits and then issues buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv. The gfx1010 runs
; add s_waitcnt_vscnt to the leading s_waitcnt vmcnt(0) and use
; s_waitcnt_vscnt alone as the trailing wait. The skip-cache-invalidations
; run keeps both waits but expects none of those trailing instructions.
5770define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
5771; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5772; GFX6:       ; %bb.0: ; %entry
5773; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5774; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5775; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5776; GFX6-NEXT:    s_mov_b32 s2, -1
5777; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5778; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5779; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5780; GFX6-NEXT:    s_waitcnt vmcnt(0)
5781; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5782; GFX6-NEXT:    s_waitcnt vmcnt(0)
5783; GFX6-NEXT:    buffer_wbinvl1
5784; GFX6-NEXT:    s_endpgm
5785;
5786; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5787; GFX7:       ; %bb.0: ; %entry
5788; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5789; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5790; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5791; GFX7-NEXT:    s_add_u32 s0, s0, 16
5792; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5793; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5794; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5795; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5796; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5797; GFX7-NEXT:    s_waitcnt vmcnt(0)
5798; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5799; GFX7-NEXT:    s_waitcnt vmcnt(0)
5800; GFX7-NEXT:    buffer_wbinvl1_vol
5801; GFX7-NEXT:    s_endpgm
5802;
5803; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5804; GFX10-WGP:       ; %bb.0: ; %entry
5805; GFX10-WGP-NEXT:    s_clause 0x1
5806; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5807; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5808; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5809; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5810; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5811; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5812; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5813; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5814; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5815; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5816; GFX10-WGP-NEXT:    buffer_gl0_inv
5817; GFX10-WGP-NEXT:    buffer_gl1_inv
5818; GFX10-WGP-NEXT:    s_endpgm
5819;
5820; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5821; GFX10-CU:       ; %bb.0: ; %entry
5822; GFX10-CU-NEXT:    s_clause 0x1
5823; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5824; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5825; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5826; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5827; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5828; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5829; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5830; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5831; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5832; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5833; GFX10-CU-NEXT:    buffer_gl0_inv
5834; GFX10-CU-NEXT:    buffer_gl1_inv
5835; GFX10-CU-NEXT:    s_endpgm
5836;
5837; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5838; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5839; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5840; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5841; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5842; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5843; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5845; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5846; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5847; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5848; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5849; SKIP-CACHE-INV-NEXT:    s_endpgm
5850;
5851; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5852; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5853; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5854; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5855; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5856; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5857; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5858; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5859; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5860; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5861; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5862; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5863;
5864; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
5865; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5866; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5867; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5868; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5869; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5870; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5871; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5872; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5873; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5874; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5875; GFX90A-TGSPLIT-NEXT:    s_endpgm
5876    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5877entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
5878  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5879  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
5880  ret void
5881}
5882
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acq_rel and failure ordering
; acquire. The value returned by the cmpxchg is never used.
; Apart from the -amdgcn-skip-cache-invalidations run, each expected sequence
; brackets the atomic with waits and then issues buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv. The gfx1010 runs
; add s_waitcnt_vscnt to the leading s_waitcnt vmcnt(0) and use
; s_waitcnt_vscnt alone as the trailing wait. The skip-cache-invalidations
; run keeps both waits but expects none of those trailing instructions.
5883define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
5884; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5885; GFX6:       ; %bb.0: ; %entry
5886; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5887; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5888; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5889; GFX6-NEXT:    s_mov_b32 s2, -1
5890; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5891; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5892; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5893; GFX6-NEXT:    s_waitcnt vmcnt(0)
5894; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5895; GFX6-NEXT:    s_waitcnt vmcnt(0)
5896; GFX6-NEXT:    buffer_wbinvl1
5897; GFX6-NEXT:    s_endpgm
5898;
5899; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5900; GFX7:       ; %bb.0: ; %entry
5901; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5902; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5903; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5904; GFX7-NEXT:    s_add_u32 s0, s0, 16
5905; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5906; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5907; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5908; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5909; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5910; GFX7-NEXT:    s_waitcnt vmcnt(0)
5911; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5912; GFX7-NEXT:    s_waitcnt vmcnt(0)
5913; GFX7-NEXT:    buffer_wbinvl1_vol
5914; GFX7-NEXT:    s_endpgm
5915;
5916; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5917; GFX10-WGP:       ; %bb.0: ; %entry
5918; GFX10-WGP-NEXT:    s_clause 0x1
5919; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5920; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5921; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5922; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5923; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5924; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5925; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5926; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5927; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5928; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5929; GFX10-WGP-NEXT:    buffer_gl0_inv
5930; GFX10-WGP-NEXT:    buffer_gl1_inv
5931; GFX10-WGP-NEXT:    s_endpgm
5932;
5933; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5934; GFX10-CU:       ; %bb.0: ; %entry
5935; GFX10-CU-NEXT:    s_clause 0x1
5936; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5937; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5938; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5939; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5941; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5942; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5943; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5944; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5945; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5946; GFX10-CU-NEXT:    buffer_gl0_inv
5947; GFX10-CU-NEXT:    buffer_gl1_inv
5948; GFX10-CU-NEXT:    s_endpgm
5949;
5950; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5951; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5952; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5953; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5954; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5955; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5956; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5957; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5958; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5959; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5960; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5961; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5962; SKIP-CACHE-INV-NEXT:    s_endpgm
5963;
5964; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5965; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5966; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5967; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5968; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5969; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5970; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5971; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5972; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5973; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5974; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5975; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5976;
5977; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
5978; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5979; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5980; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5981; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5982; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5983; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5984; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5985; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5986; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5987; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5988; GFX90A-TGSPLIT-NEXT:    s_endpgm
5989    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5990entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
5991  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5992  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
5993  ret void
5994}
5995
; Exercises a volatile cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering seq_cst and failure ordering
; acquire. The value returned by the cmpxchg is never used.
; Apart from the -amdgcn-skip-cache-invalidations run, each expected sequence
; brackets the atomic with waits and then issues buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv. The gfx1010 runs
; add s_waitcnt_vscnt to the leading s_waitcnt vmcnt(0) and use
; s_waitcnt_vscnt alone as the trailing wait. The skip-cache-invalidations
; run keeps both waits but expects none of those trailing instructions.
5996define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
5997; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
5998; GFX6:       ; %bb.0: ; %entry
5999; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6000; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6001; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6002; GFX6-NEXT:    s_mov_b32 s2, -1
6003; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6004; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6005; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6006; GFX6-NEXT:    s_waitcnt vmcnt(0)
6007; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
6008; GFX6-NEXT:    s_waitcnt vmcnt(0)
6009; GFX6-NEXT:    buffer_wbinvl1
6010; GFX6-NEXT:    s_endpgm
6011;
6012; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6013; GFX7:       ; %bb.0: ; %entry
6014; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6015; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6016; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6017; GFX7-NEXT:    s_add_u32 s0, s0, 16
6018; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6019; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6020; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6021; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6022; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6023; GFX7-NEXT:    s_waitcnt vmcnt(0)
6024; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6025; GFX7-NEXT:    s_waitcnt vmcnt(0)
6026; GFX7-NEXT:    buffer_wbinvl1_vol
6027; GFX7-NEXT:    s_endpgm
6028;
6029; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6030; GFX10-WGP:       ; %bb.0: ; %entry
6031; GFX10-WGP-NEXT:    s_clause 0x1
6032; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6033; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6034; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6035; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6036; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6037; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6038; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6039; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6040; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6041; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6042; GFX10-WGP-NEXT:    buffer_gl0_inv
6043; GFX10-WGP-NEXT:    buffer_gl1_inv
6044; GFX10-WGP-NEXT:    s_endpgm
6045;
6046; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6047; GFX10-CU:       ; %bb.0: ; %entry
6048; GFX10-CU-NEXT:    s_clause 0x1
6049; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6050; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6051; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6052; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6053; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6054; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6055; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6056; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6057; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6058; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6059; GFX10-CU-NEXT:    buffer_gl0_inv
6060; GFX10-CU-NEXT:    buffer_gl1_inv
6061; GFX10-CU-NEXT:    s_endpgm
6062;
6063; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6064; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6065; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6066; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6067; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6068; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6069; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6072; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6073; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
6074; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6075; SKIP-CACHE-INV-NEXT:    s_endpgm
6076;
6077; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6078; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6079; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6080; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6081; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6082; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6083; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6084; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6085; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6086; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6087; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6088; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6089;
6090; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
6091; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6092; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6093; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6094; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6095; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6096; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6097; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6098; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6099; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6100; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6101; GFX90A-TGSPLIT-NEXT:    s_endpgm
6102    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6103entry:
  ; Element index 4 of an i32 array is byte offset 16, which is the 16 seen in
  ; the expected addressing of the atomic above.
6104  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6105  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
6106  ret void
6107}
6108
; cmpxchg on a global (addrspace(1)) i32 with syncscope("agent-one-as"),
; success ordering seq_cst and failure ordering seq_cst. The result %val is
; never used; the expected atomic instructions below have no destination
; register and no glc modifier. Every expected sequence has s_waitcnt vmcnt(0)
; (and/or s_waitcnt_vscnt null, 0x0 in the gfx1010 runs) immediately before the
; atomic and again after it, followed by buffer_wbinvl1, buffer_wbinvl1_vol or
; buffer_gl0_inv + buffer_gl1_inv; the run that passes
; -amdgcn-skip-cache-invalidations expects none of those trailing instructions.
6109define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
6110; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6111; GFX6:       ; %bb.0: ; %entry
6112; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6113; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6114; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6115; GFX6-NEXT:    s_mov_b32 s2, -1
6116; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6117; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6118; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6119; GFX6-NEXT:    s_waitcnt vmcnt(0)
6120; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
6121; GFX6-NEXT:    s_waitcnt vmcnt(0)
6122; GFX6-NEXT:    buffer_wbinvl1
6123; GFX6-NEXT:    s_endpgm
6124;
6125; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6126; GFX7:       ; %bb.0: ; %entry
6127; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6128; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6129; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6130; GFX7-NEXT:    s_add_u32 s0, s0, 16
6131; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6132; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6133; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6134; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6135; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6136; GFX7-NEXT:    s_waitcnt vmcnt(0)
6137; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6138; GFX7-NEXT:    s_waitcnt vmcnt(0)
6139; GFX7-NEXT:    buffer_wbinvl1_vol
6140; GFX7-NEXT:    s_endpgm
6141;
6142; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6143; GFX10-WGP:       ; %bb.0: ; %entry
6144; GFX10-WGP-NEXT:    s_clause 0x1
6145; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6146; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6147; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6148; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6149; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6150; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6151; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6152; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6153; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6154; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6155; GFX10-WGP-NEXT:    buffer_gl0_inv
6156; GFX10-WGP-NEXT:    buffer_gl1_inv
6157; GFX10-WGP-NEXT:    s_endpgm
6158;
6159; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6160; GFX10-CU:       ; %bb.0: ; %entry
6161; GFX10-CU-NEXT:    s_clause 0x1
6162; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6163; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6164; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6165; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6166; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6167; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6168; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6169; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6170; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6171; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6172; GFX10-CU-NEXT:    buffer_gl0_inv
6173; GFX10-CU-NEXT:    buffer_gl1_inv
6174; GFX10-CU-NEXT:    s_endpgm
6175;
6176; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6177; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6178; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6179; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6180; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6181; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6182; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6184; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6185; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6186; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
6187; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6188; SKIP-CACHE-INV-NEXT:    s_endpgm
6189;
6190; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6191; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6192; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6193; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6194; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6195; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6196; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6197; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6198; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6199; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6200; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6201; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6202;
6203; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
6204; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6205; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6206; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6207; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6208; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6209; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6210; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6211; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6212; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6213; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6214; GFX90A-TGSPLIT-NEXT:    s_endpgm
6215    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6216entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6217  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, swap in %in; %val is intentionally left unused.
6218  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
6219  ret void
6220}
6221
; Returning variant: cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acquire and failure ordering
; monotonic. The i32 member of the result is stored to %out; the expected
; atomics below carry the glc modifier and are followed by a store.
; No s_waitcnt vmcnt(0) is expected before the atomic here, only after it.
; The trailing buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
; buffer_gl1_inv is absent only in the run that passes
; -amdgcn-skip-cache-invalidations.
6222define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
6223; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6224; GFX6:       ; %bb.0: ; %entry
6225; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6226; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6227; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6228; GFX6-NEXT:    s_mov_b32 s2, -1
6229; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6230; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6231; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6232; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6233; GFX6-NEXT:    s_waitcnt vmcnt(0)
6234; GFX6-NEXT:    buffer_wbinvl1
6235; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6236; GFX6-NEXT:    s_endpgm
6237;
6238; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6239; GFX7:       ; %bb.0: ; %entry
6240; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6241; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6242; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6243; GFX7-NEXT:    s_add_u32 s4, s0, 16
6244; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6245; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6246; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6247; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6248; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6249; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6250; GFX7-NEXT:    s_waitcnt vmcnt(0)
6251; GFX7-NEXT:    buffer_wbinvl1_vol
6252; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6253; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6254; GFX7-NEXT:    flat_store_dword v[0:1], v2
6255; GFX7-NEXT:    s_endpgm
6256;
6257; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6258; GFX10-WGP:       ; %bb.0: ; %entry
6259; GFX10-WGP-NEXT:    s_clause 0x1
6260; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6261; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6262; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6263; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6264; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6265; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6266; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6267; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6268; GFX10-WGP-NEXT:    buffer_gl0_inv
6269; GFX10-WGP-NEXT:    buffer_gl1_inv
6270; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6271; GFX10-WGP-NEXT:    s_endpgm
6272;
6273; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6274; GFX10-CU:       ; %bb.0: ; %entry
6275; GFX10-CU-NEXT:    s_clause 0x1
6276; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6277; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6278; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6279; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6280; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6281; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6282; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6283; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6284; GFX10-CU-NEXT:    buffer_gl0_inv
6285; GFX10-CU-NEXT:    buffer_gl1_inv
6286; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6287; GFX10-CU-NEXT:    s_endpgm
6288;
6289; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6290; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6291; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6292; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6293; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6294; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6295; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6297; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6298; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6299; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6300; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6301; SKIP-CACHE-INV-NEXT:    s_endpgm
6302;
6303; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6304; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6305; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6306; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6307; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6308; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6309; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6310; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6311; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6312; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6313; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6314; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6315;
6316; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
6317; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6318; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6319; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6320; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6321; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6322; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6323; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6324; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6325; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6326; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6327; GFX90A-TGSPLIT-NEXT:    s_endpgm
6328    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6329entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6330  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6331  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
  ; Index 0 of the { i32, i1 } result is its i32 member; it is stored to %out
  ; itself (not %gep), which is why the expected stores have no offset.
6332  %val0 = extractvalue { i32, i1 } %val, 0
6333  store i32 %val0, i32 addrspace(1)* %out, align 4
6334  ret void
6335}
6336
; Returning variant: cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acq_rel and failure ordering
; monotonic. The i32 member of the result is stored to %out; the expected
; atomics below carry the glc modifier and are followed by a store.
; Every expected sequence has s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0
; in the gfx1010 runs) immediately before the atomic and s_waitcnt vmcnt(0)
; after it. The trailing buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
; buffer_gl1_inv is absent only in the run that passes
; -amdgcn-skip-cache-invalidations.
6337define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
6338; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6339; GFX6:       ; %bb.0: ; %entry
6340; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6341; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6342; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6343; GFX6-NEXT:    s_mov_b32 s2, -1
6344; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6345; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6346; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6347; GFX6-NEXT:    s_waitcnt vmcnt(0)
6348; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6349; GFX6-NEXT:    s_waitcnt vmcnt(0)
6350; GFX6-NEXT:    buffer_wbinvl1
6351; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6352; GFX6-NEXT:    s_endpgm
6353;
6354; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6355; GFX7:       ; %bb.0: ; %entry
6356; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6357; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6359; GFX7-NEXT:    s_add_u32 s4, s0, 16
6360; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6361; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6362; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6363; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6364; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6365; GFX7-NEXT:    s_waitcnt vmcnt(0)
6366; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6367; GFX7-NEXT:    s_waitcnt vmcnt(0)
6368; GFX7-NEXT:    buffer_wbinvl1_vol
6369; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6370; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6371; GFX7-NEXT:    flat_store_dword v[0:1], v2
6372; GFX7-NEXT:    s_endpgm
6373;
6374; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6375; GFX10-WGP:       ; %bb.0: ; %entry
6376; GFX10-WGP-NEXT:    s_clause 0x1
6377; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6378; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6379; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6380; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6381; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6382; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6383; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6384; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6385; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6386; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6387; GFX10-WGP-NEXT:    buffer_gl0_inv
6388; GFX10-WGP-NEXT:    buffer_gl1_inv
6389; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6390; GFX10-WGP-NEXT:    s_endpgm
6391;
6392; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6393; GFX10-CU:       ; %bb.0: ; %entry
6394; GFX10-CU-NEXT:    s_clause 0x1
6395; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6396; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6397; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6398; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6399; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6400; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6401; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6402; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6403; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6404; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6405; GFX10-CU-NEXT:    buffer_gl0_inv
6406; GFX10-CU-NEXT:    buffer_gl1_inv
6407; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6408; GFX10-CU-NEXT:    s_endpgm
6409;
6410; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6411; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6412; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6413; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6414; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6415; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6416; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6419; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6420; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6421; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6422; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6423; SKIP-CACHE-INV-NEXT:    s_endpgm
6424;
6425; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6426; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6427; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6428; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6429; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6430; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6431; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6432; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6433; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6434; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6435; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6436; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6437; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6438;
6439; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
6440; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6441; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6442; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6443; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6444; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6445; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6446; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6447; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6448; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6449; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6450; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6451; GFX90A-TGSPLIT-NEXT:    s_endpgm
6452    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6453entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6454  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6455  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
  ; Index 0 of the { i32, i1 } result is its i32 member; it is stored to %out
  ; itself (not %gep), which is why the expected stores have no offset.
6456  %val0 = extractvalue { i32, i1 } %val, 0
6457  store i32 %val0, i32 addrspace(1)* %out, align 4
6458  ret void
6459}
6460
; Returning variant: cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering seq_cst and failure ordering
; monotonic. The i32 member of the result is stored to %out; the expected
; atomics below carry the glc modifier and are followed by a store.
; Every expected sequence has s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0
; in the gfx1010 runs) immediately before the atomic and s_waitcnt vmcnt(0)
; after it. The trailing buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
; buffer_gl1_inv is absent only in the run that passes
; -amdgcn-skip-cache-invalidations.
6461define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
6462; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6463; GFX6:       ; %bb.0: ; %entry
6464; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6465; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6466; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6467; GFX6-NEXT:    s_mov_b32 s2, -1
6468; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6469; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6470; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6471; GFX6-NEXT:    s_waitcnt vmcnt(0)
6472; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6473; GFX6-NEXT:    s_waitcnt vmcnt(0)
6474; GFX6-NEXT:    buffer_wbinvl1
6475; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6476; GFX6-NEXT:    s_endpgm
6477;
6478; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6479; GFX7:       ; %bb.0: ; %entry
6480; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6481; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6482; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6483; GFX7-NEXT:    s_add_u32 s4, s0, 16
6484; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6485; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6486; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6487; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6488; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6489; GFX7-NEXT:    s_waitcnt vmcnt(0)
6490; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6491; GFX7-NEXT:    s_waitcnt vmcnt(0)
6492; GFX7-NEXT:    buffer_wbinvl1_vol
6493; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6494; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6495; GFX7-NEXT:    flat_store_dword v[0:1], v2
6496; GFX7-NEXT:    s_endpgm
6497;
6498; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6499; GFX10-WGP:       ; %bb.0: ; %entry
6500; GFX10-WGP-NEXT:    s_clause 0x1
6501; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6502; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6503; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6504; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6505; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6506; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6507; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6508; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6509; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6510; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6511; GFX10-WGP-NEXT:    buffer_gl0_inv
6512; GFX10-WGP-NEXT:    buffer_gl1_inv
6513; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6514; GFX10-WGP-NEXT:    s_endpgm
6515;
6516; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6517; GFX10-CU:       ; %bb.0: ; %entry
6518; GFX10-CU-NEXT:    s_clause 0x1
6519; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6520; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6521; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6522; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6523; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6525; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6526; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6527; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6528; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6529; GFX10-CU-NEXT:    buffer_gl0_inv
6530; GFX10-CU-NEXT:    buffer_gl1_inv
6531; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6532; GFX10-CU-NEXT:    s_endpgm
6533;
6534; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6535; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6536; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6537; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6539; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6540; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6541; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6543; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6544; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6545; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6546; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6547; SKIP-CACHE-INV-NEXT:    s_endpgm
6548;
6549; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6550; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6552; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6553; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6554; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6555; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6556; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6557; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6558; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6559; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6560; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6561; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6562;
6563; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
6564; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6565; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6566; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6567; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6568; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6569; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6570; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6571; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6572; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6573; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6574; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6575; GFX90A-TGSPLIT-NEXT:    s_endpgm
6576    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6577entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6578  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6579  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
  ; Index 0 of the { i32, i1 } result is its i32 member; it is stored to %out
  ; itself (not %gep), which is why the expected stores have no offset.
6580  %val0 = extractvalue { i32, i1 } %val, 0
6581  store i32 %val0, i32 addrspace(1)* %out, align 4
6582  ret void
6583}
6584
; Returning variant: cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering acquire and failure ordering
; acquire. The i32 member of the result is stored to %out; the expected
; atomics below carry the glc modifier and are followed by a store.
; No s_waitcnt vmcnt(0) is expected before the atomic here, only after it.
; The trailing buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
; buffer_gl1_inv is absent only in the run that passes
; -amdgcn-skip-cache-invalidations.
6585define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
6586; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6587; GFX6:       ; %bb.0: ; %entry
6588; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6589; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6590; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6591; GFX6-NEXT:    s_mov_b32 s2, -1
6592; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6593; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6594; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6595; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6596; GFX6-NEXT:    s_waitcnt vmcnt(0)
6597; GFX6-NEXT:    buffer_wbinvl1
6598; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6599; GFX6-NEXT:    s_endpgm
6600;
6601; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6602; GFX7:       ; %bb.0: ; %entry
6603; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6604; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6605; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6606; GFX7-NEXT:    s_add_u32 s4, s0, 16
6607; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6608; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6609; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6610; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6611; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6612; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6613; GFX7-NEXT:    s_waitcnt vmcnt(0)
6614; GFX7-NEXT:    buffer_wbinvl1_vol
6615; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6616; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6617; GFX7-NEXT:    flat_store_dword v[0:1], v2
6618; GFX7-NEXT:    s_endpgm
6619;
6620; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6621; GFX10-WGP:       ; %bb.0: ; %entry
6622; GFX10-WGP-NEXT:    s_clause 0x1
6623; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6624; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6625; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6626; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6627; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6628; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6629; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6630; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6631; GFX10-WGP-NEXT:    buffer_gl0_inv
6632; GFX10-WGP-NEXT:    buffer_gl1_inv
6633; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6634; GFX10-WGP-NEXT:    s_endpgm
6635;
6636; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6637; GFX10-CU:       ; %bb.0: ; %entry
6638; GFX10-CU-NEXT:    s_clause 0x1
6639; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6640; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6641; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6642; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6643; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6644; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6645; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6646; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6647; GFX10-CU-NEXT:    buffer_gl0_inv
6648; GFX10-CU-NEXT:    buffer_gl1_inv
6649; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6650; GFX10-CU-NEXT:    s_endpgm
6651;
6652; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6653; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6654; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6655; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6656; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6657; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6658; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6659; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6660; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6661; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6662; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6663; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6664; SKIP-CACHE-INV-NEXT:    s_endpgm
6665;
6666; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6667; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6668; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6669; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6670; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6671; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6672; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6673; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6674; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6675; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6676; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6677; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6678;
6679; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
6680; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6681; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6682; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6683; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6684; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6685; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6686; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6687; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6688; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6689; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6690; GFX90A-TGSPLIT-NEXT:    s_endpgm
6691    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6692entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6693  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6694  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
  ; Index 0 of the { i32, i1 } result is its i32 member; it is stored to %out
  ; itself (not %gep), which is why the expected stores have no offset.
6695  %val0 = extractvalue { i32, i1 } %val, 0
6696  store i32 %val0, i32 addrspace(1)* %out, align 4
6697  ret void
6698}
6699
; Returning variant: cmpxchg on a global (addrspace(1)) i32 with
; syncscope("agent-one-as"), success ordering release and failure ordering
; acquire. The i32 member of the result is stored to %out; the expected
; atomics below carry the glc modifier and are followed by a store.
; Every expected sequence has s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0
; in the gfx1010 runs) immediately before the atomic and s_waitcnt vmcnt(0)
; after it. The trailing buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
; buffer_gl1_inv is absent only in the run that passes
; -amdgcn-skip-cache-invalidations.
6700define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
6701; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6702; GFX6:       ; %bb.0: ; %entry
6703; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6704; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6705; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6706; GFX6-NEXT:    s_mov_b32 s2, -1
6707; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6708; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6709; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6710; GFX6-NEXT:    s_waitcnt vmcnt(0)
6711; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6712; GFX6-NEXT:    s_waitcnt vmcnt(0)
6713; GFX6-NEXT:    buffer_wbinvl1
6714; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6715; GFX6-NEXT:    s_endpgm
6716;
6717; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6718; GFX7:       ; %bb.0: ; %entry
6719; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6720; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6721; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6722; GFX7-NEXT:    s_add_u32 s4, s0, 16
6723; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6724; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6725; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6726; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6727; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6728; GFX7-NEXT:    s_waitcnt vmcnt(0)
6729; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6730; GFX7-NEXT:    s_waitcnt vmcnt(0)
6731; GFX7-NEXT:    buffer_wbinvl1_vol
6732; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6733; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6734; GFX7-NEXT:    flat_store_dword v[0:1], v2
6735; GFX7-NEXT:    s_endpgm
6736;
6737; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6738; GFX10-WGP:       ; %bb.0: ; %entry
6739; GFX10-WGP-NEXT:    s_clause 0x1
6740; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6741; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6742; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6743; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6744; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6745; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6746; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6747; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6748; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6749; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6750; GFX10-WGP-NEXT:    buffer_gl0_inv
6751; GFX10-WGP-NEXT:    buffer_gl1_inv
6752; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6753; GFX10-WGP-NEXT:    s_endpgm
6754;
6755; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6756; GFX10-CU:       ; %bb.0: ; %entry
6757; GFX10-CU-NEXT:    s_clause 0x1
6758; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6759; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6760; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6761; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6762; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6763; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6764; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6765; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6766; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6767; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6768; GFX10-CU-NEXT:    buffer_gl0_inv
6769; GFX10-CU-NEXT:    buffer_gl1_inv
6770; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6771; GFX10-CU-NEXT:    s_endpgm
6772;
6773; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6774; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6775; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6776; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6777; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6778; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6779; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6780; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6782; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6783; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6784; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6785; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6786; SKIP-CACHE-INV-NEXT:    s_endpgm
6787;
6788; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6789; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6790; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6791; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6792; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6793; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6794; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6795; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6796; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6797; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6798; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6799; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6800; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6801;
6802; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
6803; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6804; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6805; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6806; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6807; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6808; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6809; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6810; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6811; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6812; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6813; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6814; GFX90A-TGSPLIT-NEXT:    s_endpgm
6815    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6816entry:
  ; %gep = &%out[4]: four i32 elements, i.e. the 16-byte offset that appears as
  ; offset:16 (or as an s_add_u32 of 16 ahead of the flat atomic) above.
6817  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6818  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
  ; Index 0 of the { i32, i1 } result is its i32 member; it is stored to %out
  ; itself (not %gep), which is why the expected stores have no offset.
6819  %val0 = extractvalue { i32, i1 } %val, 0
6820  store i32 %val0, i32 addrspace(1)* %out, align 4
6821  ret void
6822}
6823
; Returning cmpxchg on global memory (addrspace(1)) at syncscope "agent-one-as"
; with success ordering acq_rel and failure ordering acquire. The kernel
; compare-and-swaps the i32 located four elements past %out (byte offset 16 in
; the expected instructions) and stores the value the cmpxchg read to %out.
;
; The per-target expectations are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them rather than editing by
; hand. Every target expects an "s_waitcnt vmcnt(0)" on each side of the atomic.
; Every run except the one passing -amdgcn-skip-cache-invalidations also expects
; a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus
; buffer_gl1_inv) between the atomic and the final store.
; NOTE(review) presumably the leading wait covers the release half and the
; trailing wait plus invalidate covers the acquire half - confirm against the
; AMDGPU memory model documentation.
6824define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
6825; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6826; GFX6:       ; %bb.0: ; %entry
6827; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6828; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6829; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6830; GFX6-NEXT:    s_mov_b32 s2, -1
6831; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6832; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6833; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6834; GFX6-NEXT:    s_waitcnt vmcnt(0)
6835; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6836; GFX6-NEXT:    s_waitcnt vmcnt(0)
6837; GFX6-NEXT:    buffer_wbinvl1
6838; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6839; GFX6-NEXT:    s_endpgm
6840;
6841; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6842; GFX7:       ; %bb.0: ; %entry
6843; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6844; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6845; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6846; GFX7-NEXT:    s_add_u32 s4, s0, 16
6847; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6848; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6849; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6850; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6851; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6852; GFX7-NEXT:    s_waitcnt vmcnt(0)
6853; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6854; GFX7-NEXT:    s_waitcnt vmcnt(0)
6855; GFX7-NEXT:    buffer_wbinvl1_vol
6856; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6857; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6858; GFX7-NEXT:    flat_store_dword v[0:1], v2
6859; GFX7-NEXT:    s_endpgm
6860;
6861; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6862; GFX10-WGP:       ; %bb.0: ; %entry
6863; GFX10-WGP-NEXT:    s_clause 0x1
6864; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6865; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6866; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6867; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6868; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6869; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6870; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6871; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6872; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6873; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6874; GFX10-WGP-NEXT:    buffer_gl0_inv
6875; GFX10-WGP-NEXT:    buffer_gl1_inv
6876; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6877; GFX10-WGP-NEXT:    s_endpgm
6878;
6879; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6880; GFX10-CU:       ; %bb.0: ; %entry
6881; GFX10-CU-NEXT:    s_clause 0x1
6882; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6883; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6884; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6885; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6886; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6887; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6888; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6889; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6890; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6891; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6892; GFX10-CU-NEXT:    buffer_gl0_inv
6893; GFX10-CU-NEXT:    buffer_gl1_inv
6894; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6895; GFX10-CU-NEXT:    s_endpgm
6896;
6897; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6898; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6899; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6900; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6901; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6902; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6903; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6904; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6905; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6906; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6907; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6908; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6909; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6910; SKIP-CACHE-INV-NEXT:    s_endpgm
6911;
6912; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6913; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6914; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6915; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6916; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6917; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6918; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6919; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6920; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6921; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6922; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6923; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6924; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6925;
6926; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
6927; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6928; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6929; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6930; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6931; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6932; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6933; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6934; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6935; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6936; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6937; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6938; GFX90A-TGSPLIT-NEXT:    s_endpgm
6939    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6940entry:
  ; Address of the i32 four elements past %out.
6941  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in).
6942  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
6943  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result to %out keeps the returning form of the atomic in use.
6944  store i32 %val0, i32 addrspace(1)* %out, align 4
6945  ret void
6946}
6947
; Returning cmpxchg on global memory (addrspace(1)) at syncscope "agent-one-as"
; with success ordering seq_cst and failure ordering acquire. The kernel
; compare-and-swaps the i32 located four elements past %out (byte offset 16 in
; the expected instructions) and stores the value the cmpxchg read to %out.
;
; The per-target expectations are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them rather than editing by
; hand. Every target expects an "s_waitcnt vmcnt(0)" on each side of the atomic.
; Every run except the one passing -amdgcn-skip-cache-invalidations also expects
; a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus
; buffer_gl1_inv) between the atomic and the final store.
; NOTE(review) presumably the leading wait covers the release half and the
; trailing wait plus invalidate covers the acquire half - confirm against the
; AMDGPU memory model documentation.
6948define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
6949; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
6950; GFX6:       ; %bb.0: ; %entry
6951; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6952; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6953; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6954; GFX6-NEXT:    s_mov_b32 s2, -1
6955; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6956; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6957; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6958; GFX6-NEXT:    s_waitcnt vmcnt(0)
6959; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6960; GFX6-NEXT:    s_waitcnt vmcnt(0)
6961; GFX6-NEXT:    buffer_wbinvl1
6962; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6963; GFX6-NEXT:    s_endpgm
6964;
6965; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
6966; GFX7:       ; %bb.0: ; %entry
6967; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6968; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6969; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6970; GFX7-NEXT:    s_add_u32 s4, s0, 16
6971; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6972; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6973; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6974; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6975; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6976; GFX7-NEXT:    s_waitcnt vmcnt(0)
6977; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6978; GFX7-NEXT:    s_waitcnt vmcnt(0)
6979; GFX7-NEXT:    buffer_wbinvl1_vol
6980; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6981; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6982; GFX7-NEXT:    flat_store_dword v[0:1], v2
6983; GFX7-NEXT:    s_endpgm
6984;
6985; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
6986; GFX10-WGP:       ; %bb.0: ; %entry
6987; GFX10-WGP-NEXT:    s_clause 0x1
6988; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6989; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6990; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6991; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6992; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6993; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6994; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6995; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6996; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6997; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6998; GFX10-WGP-NEXT:    buffer_gl0_inv
6999; GFX10-WGP-NEXT:    buffer_gl1_inv
7000; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
7001; GFX10-WGP-NEXT:    s_endpgm
7002;
7003; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
7004; GFX10-CU:       ; %bb.0: ; %entry
7005; GFX10-CU-NEXT:    s_clause 0x1
7006; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7007; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7008; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
7009; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7010; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7011; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7012; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7013; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7014; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7015; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7016; GFX10-CU-NEXT:    buffer_gl0_inv
7017; GFX10-CU-NEXT:    buffer_gl1_inv
7018; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
7019; GFX10-CU-NEXT:    s_endpgm
7020;
7021; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
7022; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7023; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7024; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7025; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
7026; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
7027; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7028; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7029; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7030; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7031; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7032; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7033; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7034; SKIP-CACHE-INV-NEXT:    s_endpgm
7035;
7036; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
7037; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7038; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7039; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7040; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7041; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7042; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7043; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7044; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7045; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7046; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7047; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7048; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7049;
7050; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
7051; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7052; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7053; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7054; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7055; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7056; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7057; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7058; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7059; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7060; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7061; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7062; GFX90A-TGSPLIT-NEXT:    s_endpgm
7063    i32 addrspace(1)* %out, i32 %in, i32 %old) {
7064entry:
  ; Address of the i32 four elements past %out.
7065  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in).
7066  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory.
7067  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result to %out keeps the returning form of the atomic in use.
7068  store i32 %val0, i32 addrspace(1)* %out, align 4
7069  ret void
7070}
7071
; Returning cmpxchg on global memory (addrspace(1)) at syncscope "agent-one-as"
; with seq_cst as both the success and the failure ordering. The kernel
; compare-and-swaps the i32 located four elements past %out (byte offset 16 in
; the expected instructions) and stores the value the cmpxchg read to %out.
;
; The per-target expectations are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them rather than editing by
; hand. Every target expects an "s_waitcnt vmcnt(0)" on each side of the atomic.
; Every run except the one passing -amdgcn-skip-cache-invalidations also expects
; a cache invalidate (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus
; buffer_gl1_inv) between the atomic and the final store.
; NOTE(review) presumably the leading wait covers the release half and the
; trailing wait plus invalidate covers the acquire half - confirm against the
; AMDGPU memory model documentation.
7072define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
7073; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7074; GFX6:       ; %bb.0: ; %entry
7075; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7076; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
7077; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
7078; GFX6-NEXT:    s_mov_b32 s2, -1
7079; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7080; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7081; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7082; GFX6-NEXT:    s_waitcnt vmcnt(0)
7083; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7084; GFX6-NEXT:    s_waitcnt vmcnt(0)
7085; GFX6-NEXT:    buffer_wbinvl1
7086; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7087; GFX6-NEXT:    s_endpgm
7088;
7089; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7090; GFX7:       ; %bb.0: ; %entry
7091; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7092; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7093; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7094; GFX7-NEXT:    s_add_u32 s4, s0, 16
7095; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7096; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7097; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7098; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7099; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7100; GFX7-NEXT:    s_waitcnt vmcnt(0)
7101; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7102; GFX7-NEXT:    s_waitcnt vmcnt(0)
7103; GFX7-NEXT:    buffer_wbinvl1_vol
7104; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7105; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7106; GFX7-NEXT:    flat_store_dword v[0:1], v2
7107; GFX7-NEXT:    s_endpgm
7108;
7109; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7110; GFX10-WGP:       ; %bb.0: ; %entry
7111; GFX10-WGP-NEXT:    s_clause 0x1
7112; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7113; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7114; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
7115; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7116; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7117; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7118; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7119; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7120; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7121; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7122; GFX10-WGP-NEXT:    buffer_gl0_inv
7123; GFX10-WGP-NEXT:    buffer_gl1_inv
7124; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
7125; GFX10-WGP-NEXT:    s_endpgm
7126;
7127; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7128; GFX10-CU:       ; %bb.0: ; %entry
7129; GFX10-CU-NEXT:    s_clause 0x1
7130; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7131; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7132; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
7133; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7134; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7135; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7136; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7137; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7138; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7139; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7140; GFX10-CU-NEXT:    buffer_gl0_inv
7141; GFX10-CU-NEXT:    buffer_gl1_inv
7142; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
7143; GFX10-CU-NEXT:    s_endpgm
7144;
7145; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7146; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7147; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7148; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7149; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
7150; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
7151; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7152; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7153; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7154; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7155; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7156; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7157; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7158; SKIP-CACHE-INV-NEXT:    s_endpgm
7159;
7160; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7161; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7162; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7163; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7164; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7165; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7166; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7167; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7168; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7169; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7170; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7171; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7172; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7173;
7174; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
7175; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7176; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7177; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7178; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7179; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7180; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7181; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7182; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7183; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7184; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7185; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7186; GFX90A-TGSPLIT-NEXT:    s_endpgm
7187    i32 addrspace(1)* %out, i32 %in, i32 %old) {
7188entry:
  ; Address of the i32 four elements past %out.
7189  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in).
7190  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory.
7191  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result to %out keeps the returning form of the atomic in use.
7192  store i32 %val0, i32 addrspace(1)* %out, align 4
7193  ret void
7194}
7195
7196