1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9
; Kernel under test: performs an `unordered` atomic i32 load from %in
; (addrspace(1), no syncscope given) and writes the value to %out with an
; ordinary store.
; Expected output for every llc invocation listed at the top of the file: the
; load carries no cache-policy modifier (no glc/dlc), a wait on vmcnt(0)
; precedes the store, and no cache-invalidate instruction appears.
10define amdgpu_kernel void @global_system_unordered_load(
11; GFX6-LABEL: global_system_unordered_load:
12; GFX6:       ; %bb.0: ; %entry
13; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
14; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
15; GFX6-NEXT:    s_mov_b32 s2, -1
16; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX6-NEXT:    s_mov_b32 s0, s4
18; GFX6-NEXT:    s_mov_b32 s1, s5
19; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
20; GFX6-NEXT:    s_mov_b32 s4, s6
21; GFX6-NEXT:    s_mov_b32 s5, s7
22; GFX6-NEXT:    s_mov_b32 s6, s2
23; GFX6-NEXT:    s_mov_b32 s7, s3
24; GFX6-NEXT:    s_waitcnt vmcnt(0)
25; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
26; GFX6-NEXT:    s_endpgm
27;
28; GFX7-LABEL: global_system_unordered_load:
29; GFX7:       ; %bb.0: ; %entry
30; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
31; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX7-NEXT:    v_mov_b32_e32 v0, s0
33; GFX7-NEXT:    v_mov_b32_e32 v1, s1
34; GFX7-NEXT:    flat_load_dword v0, v[0:1]
35; GFX7-NEXT:    v_mov_b32_e32 v2, s2
36; GFX7-NEXT:    v_mov_b32_e32 v3, s3
37; GFX7-NEXT:    s_waitcnt vmcnt(0)
38; GFX7-NEXT:    flat_store_dword v[2:3], v0
39; GFX7-NEXT:    s_endpgm
40;
41; GFX10-WGP-LABEL: global_system_unordered_load:
42; GFX10-WGP:       ; %bb.0: ; %entry
43; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
44; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
45; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
47; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
48; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
49; GFX10-WGP-NEXT:    s_endpgm
50;
51; GFX10-CU-LABEL: global_system_unordered_load:
52; GFX10-CU:       ; %bb.0: ; %entry
53; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
54; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
55; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
57; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
59; GFX10-CU-NEXT:    s_endpgm
60;
61; SKIP-CACHE-INV-LABEL: global_system_unordered_load:
62; SKIP-CACHE-INV:       ; %bb.0: ; %entry
63; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
64; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
66; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
67; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
68; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
69; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
70; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
71; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
72; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
73; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
74; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
75; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
76; SKIP-CACHE-INV-NEXT:    s_endpgm
77;
78; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load:
79; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
80; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
81; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
82; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
84; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
85; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
86; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
87;
88; GFX90A-TGSPLIT-LABEL: global_system_unordered_load:
89; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
90; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
91; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
92; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
94; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
95; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
96; GFX90A-TGSPLIT-NEXT:    s_endpgm
97    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
98entry:
99  %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4
100  store i32 %val, i32 addrspace(1)* %out
101  ret void
102}
103
; Kernel under test: performs a `monotonic` atomic i32 load from %in
; (addrspace(1), no syncscope given) and writes the value to %out with an
; ordinary store.
; Expected output: the load instruction carries `glc` on every target
; (`glc dlc` on the two gfx1010 invocations); a wait on vmcnt(0) precedes the
; store and no cache-invalidate instruction appears.
104define amdgpu_kernel void @global_system_monotonic_load(
105; GFX6-LABEL: global_system_monotonic_load:
106; GFX6:       ; %bb.0: ; %entry
107; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
108; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
109; GFX6-NEXT:    s_mov_b32 s2, -1
110; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX6-NEXT:    s_mov_b32 s0, s4
112; GFX6-NEXT:    s_mov_b32 s1, s5
113; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
114; GFX6-NEXT:    s_mov_b32 s4, s6
115; GFX6-NEXT:    s_mov_b32 s5, s7
116; GFX6-NEXT:    s_mov_b32 s6, s2
117; GFX6-NEXT:    s_mov_b32 s7, s3
118; GFX6-NEXT:    s_waitcnt vmcnt(0)
119; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
120; GFX6-NEXT:    s_endpgm
121;
122; GFX7-LABEL: global_system_monotonic_load:
123; GFX7:       ; %bb.0: ; %entry
124; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
125; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX7-NEXT:    v_mov_b32_e32 v0, s0
127; GFX7-NEXT:    v_mov_b32_e32 v1, s1
128; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
129; GFX7-NEXT:    v_mov_b32_e32 v2, s2
130; GFX7-NEXT:    v_mov_b32_e32 v3, s3
131; GFX7-NEXT:    s_waitcnt vmcnt(0)
132; GFX7-NEXT:    flat_store_dword v[2:3], v0
133; GFX7-NEXT:    s_endpgm
134;
135; GFX10-WGP-LABEL: global_system_monotonic_load:
136; GFX10-WGP:       ; %bb.0: ; %entry
137; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
138; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
139; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
141; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
142; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
143; GFX10-WGP-NEXT:    s_endpgm
144;
145; GFX10-CU-LABEL: global_system_monotonic_load:
146; GFX10-CU:       ; %bb.0: ; %entry
147; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
148; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
149; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
151; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
152; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
153; GFX10-CU-NEXT:    s_endpgm
154;
155; SKIP-CACHE-INV-LABEL: global_system_monotonic_load:
156; SKIP-CACHE-INV:       ; %bb.0: ; %entry
157; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
158; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
159; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
160; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
161; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
162; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
163; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
164; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
165; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
166; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
167; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
168; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
169; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
170; SKIP-CACHE-INV-NEXT:    s_endpgm
171;
172; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load:
173; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
174; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
175; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
176; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
178; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
179; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
180; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
181;
182; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load:
183; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
184; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
185; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
186; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
188; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
189; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
190; GFX90A-TGSPLIT-NEXT:    s_endpgm
191    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
192entry:
193  %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4
194  store i32 %val, i32 addrspace(1)* %out
195  ret void
196}
197
; Kernel under test: performs an `acquire` atomic i32 load from %in
; (addrspace(1), no syncscope given) and writes the value to %out with an
; ordinary store.
; Expected output: a `glc` load (`glc dlc` on gfx1010), then a wait on
; vmcnt(0), then cache invalidation ahead of the store:
;   gfx600            buffer_wbinvl1
;   gfx700 (amdhsa)   buffer_wbinvl1_vol
;   gfx1010           buffer_gl0_inv, buffer_gl1_inv
;   gfx90a            buffer_invl2, buffer_wbinvl1_vol
; The amdpal invocation that passes -amdgcn-skip-cache-invalidations expects
; the glc load and the vmcnt(0) wait but no invalidate instruction.
198define amdgpu_kernel void @global_system_acquire_load(
199; GFX6-LABEL: global_system_acquire_load:
200; GFX6:       ; %bb.0: ; %entry
201; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
202; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
203; GFX6-NEXT:    s_mov_b32 s2, -1
204; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX6-NEXT:    s_mov_b32 s0, s4
206; GFX6-NEXT:    s_mov_b32 s1, s5
207; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
208; GFX6-NEXT:    s_waitcnt vmcnt(0)
209; GFX6-NEXT:    buffer_wbinvl1
210; GFX6-NEXT:    s_mov_b32 s4, s6
211; GFX6-NEXT:    s_mov_b32 s5, s7
212; GFX6-NEXT:    s_mov_b32 s6, s2
213; GFX6-NEXT:    s_mov_b32 s7, s3
214; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
215; GFX6-NEXT:    s_endpgm
216;
217; GFX7-LABEL: global_system_acquire_load:
218; GFX7:       ; %bb.0: ; %entry
219; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
220; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX7-NEXT:    v_mov_b32_e32 v0, s0
222; GFX7-NEXT:    v_mov_b32_e32 v1, s1
223; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
224; GFX7-NEXT:    s_waitcnt vmcnt(0)
225; GFX7-NEXT:    buffer_wbinvl1_vol
226; GFX7-NEXT:    v_mov_b32_e32 v2, s2
227; GFX7-NEXT:    v_mov_b32_e32 v3, s3
228; GFX7-NEXT:    flat_store_dword v[2:3], v0
229; GFX7-NEXT:    s_endpgm
230;
231; GFX10-WGP-LABEL: global_system_acquire_load:
232; GFX10-WGP:       ; %bb.0: ; %entry
233; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
234; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
235; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
237; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
238; GFX10-WGP-NEXT:    buffer_gl0_inv
239; GFX10-WGP-NEXT:    buffer_gl1_inv
240; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
241; GFX10-WGP-NEXT:    s_endpgm
242;
243; GFX10-CU-LABEL: global_system_acquire_load:
244; GFX10-CU:       ; %bb.0: ; %entry
245; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
246; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
247; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
249; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
250; GFX10-CU-NEXT:    buffer_gl0_inv
251; GFX10-CU-NEXT:    buffer_gl1_inv
252; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
253; GFX10-CU-NEXT:    s_endpgm
254;
255; SKIP-CACHE-INV-LABEL: global_system_acquire_load:
256; SKIP-CACHE-INV:       ; %bb.0: ; %entry
257; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
258; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
259; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
260; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
261; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
262; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
263; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
264; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
265; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
266; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
267; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
268; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
269; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
270; SKIP-CACHE-INV-NEXT:    s_endpgm
271;
272; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load:
273; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
274; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
275; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
276; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
278; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
279; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
280; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
281; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
282; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
283;
284; GFX90A-TGSPLIT-LABEL: global_system_acquire_load:
285; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
286; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
287; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
288; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
290; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
291; GFX90A-TGSPLIT-NEXT:    buffer_invl2
292; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
293; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
294; GFX90A-TGSPLIT-NEXT:    s_endpgm
295    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
296entry:
297  %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4
298  store i32 %val, i32 addrspace(1)* %out
299  ret void
300}
301
; Kernel under test: performs a `seq_cst` atomic i32 load from %in
; (addrspace(1), no syncscope given) and writes the value to %out with an
; ordinary store.
; Expected output: `s_waitcnt vmcnt(0) lgkmcnt(0)` directly ahead of the load
; (gfx1010 additionally expects `s_waitcnt_vscnt null, 0x0`), then a `glc` load
; (`glc dlc` on gfx1010), a wait on vmcnt(0), and cache invalidation ahead of
; the store:
;   gfx600            buffer_wbinvl1
;   gfx700 (amdhsa)   buffer_wbinvl1_vol
;   gfx1010           buffer_gl0_inv, buffer_gl1_inv
;   gfx90a            buffer_invl2, buffer_wbinvl1_vol
; The amdpal invocation that passes -amdgcn-skip-cache-invalidations expects
; both waits and the glc load but no invalidate instruction.
302define amdgpu_kernel void @global_system_seq_cst_load(
303; GFX6-LABEL: global_system_seq_cst_load:
304; GFX6:       ; %bb.0: ; %entry
305; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
306; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
307; GFX6-NEXT:    s_mov_b32 s2, -1
308; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX6-NEXT:    s_mov_b32 s0, s4
310; GFX6-NEXT:    s_mov_b32 s1, s5
311; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
312; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
313; GFX6-NEXT:    s_waitcnt vmcnt(0)
314; GFX6-NEXT:    buffer_wbinvl1
315; GFX6-NEXT:    s_mov_b32 s4, s6
316; GFX6-NEXT:    s_mov_b32 s5, s7
317; GFX6-NEXT:    s_mov_b32 s6, s2
318; GFX6-NEXT:    s_mov_b32 s7, s3
319; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
320; GFX6-NEXT:    s_endpgm
321;
322; GFX7-LABEL: global_system_seq_cst_load:
323; GFX7:       ; %bb.0: ; %entry
324; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
325; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX7-NEXT:    v_mov_b32_e32 v0, s0
327; GFX7-NEXT:    v_mov_b32_e32 v1, s1
328; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
329; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
330; GFX7-NEXT:    s_waitcnt vmcnt(0)
331; GFX7-NEXT:    buffer_wbinvl1_vol
332; GFX7-NEXT:    v_mov_b32_e32 v2, s2
333; GFX7-NEXT:    v_mov_b32_e32 v3, s3
334; GFX7-NEXT:    flat_store_dword v[2:3], v0
335; GFX7-NEXT:    s_endpgm
336;
337; GFX10-WGP-LABEL: global_system_seq_cst_load:
338; GFX10-WGP:       ; %bb.0: ; %entry
339; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
340; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
341; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
342; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
343; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
344; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
345; GFX10-WGP-NEXT:    buffer_gl0_inv
346; GFX10-WGP-NEXT:    buffer_gl1_inv
347; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
348; GFX10-WGP-NEXT:    s_endpgm
349;
350; GFX10-CU-LABEL: global_system_seq_cst_load:
351; GFX10-CU:       ; %bb.0: ; %entry
352; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
353; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
354; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
355; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
356; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
357; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
358; GFX10-CU-NEXT:    buffer_gl0_inv
359; GFX10-CU-NEXT:    buffer_gl1_inv
360; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
361; GFX10-CU-NEXT:    s_endpgm
362;
363; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load:
364; SKIP-CACHE-INV:       ; %bb.0: ; %entry
365; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
366; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
368; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
369; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
370; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
371; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
372; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
373; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
374; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
375; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
376; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
377; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
378; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
379; SKIP-CACHE-INV-NEXT:    s_endpgm
380;
381; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
382; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
383; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
384; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
385; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
386; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
387; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
388; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
389; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
390; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
391; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
392;
393; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load:
394; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
395; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
396; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
397; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
398; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
399; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
400; GFX90A-TGSPLIT-NEXT:    buffer_invl2
401; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
402; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
403; GFX90A-TGSPLIT-NEXT:    s_endpgm
404    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
405entry:
406  %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
407  store i32 %val, i32 addrspace(1)* %out
408  ret void
409}
410
; Kernel under test: performs an `unordered` atomic store of the i32 argument
; %in to %out (addrspace(1), no syncscope given).
; Expected output: a plain store instruction on every target. The only wait is
; the `s_waitcnt lgkmcnt(0)` that follows the s_load instructions; no cache
; write-back or invalidate instruction appears.
411define amdgpu_kernel void @global_system_unordered_store(
412; GFX6-LABEL: global_system_unordered_store:
413; GFX6:       ; %bb.0: ; %entry
414; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
415; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
416; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
417; GFX6-NEXT:    s_mov_b32 s2, -1
418; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX6-NEXT:    v_mov_b32_e32 v0, s6
420; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
421; GFX6-NEXT:    s_endpgm
422;
423; GFX7-LABEL: global_system_unordered_store:
424; GFX7:       ; %bb.0: ; %entry
425; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
426; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
427; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX7-NEXT:    v_mov_b32_e32 v2, s2
429; GFX7-NEXT:    v_mov_b32_e32 v0, s0
430; GFX7-NEXT:    v_mov_b32_e32 v1, s1
431; GFX7-NEXT:    flat_store_dword v[0:1], v2
432; GFX7-NEXT:    s_endpgm
433;
434; GFX10-WGP-LABEL: global_system_unordered_store:
435; GFX10-WGP:       ; %bb.0: ; %entry
436; GFX10-WGP-NEXT:    s_clause 0x1
437; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
438; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
439; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
440; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
442; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
443; GFX10-WGP-NEXT:    s_endpgm
444;
445; GFX10-CU-LABEL: global_system_unordered_store:
446; GFX10-CU:       ; %bb.0: ; %entry
447; GFX10-CU-NEXT:    s_clause 0x1
448; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
449; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
450; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
451; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
453; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
454; GFX10-CU-NEXT:    s_endpgm
455;
456; SKIP-CACHE-INV-LABEL: global_system_unordered_store:
457; SKIP-CACHE-INV:       ; %bb.0: ; %entry
458; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
459; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
460; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
461; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
462; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
463; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
464; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
465; SKIP-CACHE-INV-NEXT:    s_endpgm
466;
467; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store:
468; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
469; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
470; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
471; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
472; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
474; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
475; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
476;
477; GFX90A-TGSPLIT-LABEL: global_system_unordered_store:
478; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
479; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
480; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
481; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
482; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
484; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
485; GFX90A-TGSPLIT-NEXT:    s_endpgm
486    i32 %in, i32 addrspace(1)* %out) {
487entry:
488  store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4
489  ret void
490}
491
; Kernel under test: performs a `monotonic` atomic store of the i32 argument
; %in to %out (addrspace(1), no syncscope given).
; Expected output: a plain store instruction on every target. The only wait is
; the `s_waitcnt lgkmcnt(0)` that follows the s_load instructions; no cache
; write-back or invalidate instruction appears.
492define amdgpu_kernel void @global_system_monotonic_store(
493; GFX6-LABEL: global_system_monotonic_store:
494; GFX6:       ; %bb.0: ; %entry
495; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
496; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
497; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
498; GFX6-NEXT:    s_mov_b32 s2, -1
499; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
500; GFX6-NEXT:    v_mov_b32_e32 v0, s6
501; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
502; GFX6-NEXT:    s_endpgm
503;
504; GFX7-LABEL: global_system_monotonic_store:
505; GFX7:       ; %bb.0: ; %entry
506; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
507; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
508; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX7-NEXT:    v_mov_b32_e32 v2, s2
510; GFX7-NEXT:    v_mov_b32_e32 v0, s0
511; GFX7-NEXT:    v_mov_b32_e32 v1, s1
512; GFX7-NEXT:    flat_store_dword v[0:1], v2
513; GFX7-NEXT:    s_endpgm
514;
515; GFX10-WGP-LABEL: global_system_monotonic_store:
516; GFX10-WGP:       ; %bb.0: ; %entry
517; GFX10-WGP-NEXT:    s_clause 0x1
518; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
519; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
520; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
521; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
523; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
524; GFX10-WGP-NEXT:    s_endpgm
525;
526; GFX10-CU-LABEL: global_system_monotonic_store:
527; GFX10-CU:       ; %bb.0: ; %entry
528; GFX10-CU-NEXT:    s_clause 0x1
529; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
530; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
531; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
532; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
534; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
535; GFX10-CU-NEXT:    s_endpgm
536;
537; SKIP-CACHE-INV-LABEL: global_system_monotonic_store:
538; SKIP-CACHE-INV:       ; %bb.0: ; %entry
539; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
540; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
541; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
542; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
543; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
544; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
545; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
546; SKIP-CACHE-INV-NEXT:    s_endpgm
547;
548; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store:
549; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
550; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
552; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
553; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
554; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
555; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
556; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
557;
558; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store:
559; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
560; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
561; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
562; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
563; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
564; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
565; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
566; GFX90A-TGSPLIT-NEXT:    s_endpgm
567    i32 %in, i32 addrspace(1)* %out) {
568entry:
569  store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4
570  ret void
571}
572
; Kernel under test: performs a `release` atomic store of the i32 argument %in
; to %out (addrspace(1), no syncscope given).
; Expected output: `s_waitcnt vmcnt(0) lgkmcnt(0)` ahead of the store on every
; target. The gfx1010 invocations additionally expect
; `s_waitcnt_vscnt null, 0x0` after that wait, and the gfx90a invocations
; expect `buffer_wbl2` before it. No invalidate instruction follows the store.
573define amdgpu_kernel void @global_system_release_store(
574; GFX6-LABEL: global_system_release_store:
575; GFX6:       ; %bb.0: ; %entry
576; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
577; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
578; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
579; GFX6-NEXT:    s_mov_b32 s2, -1
580; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
581; GFX6-NEXT:    v_mov_b32_e32 v0, s6
582; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
583; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
584; GFX6-NEXT:    s_endpgm
585;
586; GFX7-LABEL: global_system_release_store:
587; GFX7:       ; %bb.0: ; %entry
588; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
589; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
590; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX7-NEXT:    v_mov_b32_e32 v2, s2
592; GFX7-NEXT:    v_mov_b32_e32 v0, s0
593; GFX7-NEXT:    v_mov_b32_e32 v1, s1
594; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
595; GFX7-NEXT:    flat_store_dword v[0:1], v2
596; GFX7-NEXT:    s_endpgm
597;
598; GFX10-WGP-LABEL: global_system_release_store:
599; GFX10-WGP:       ; %bb.0: ; %entry
600; GFX10-WGP-NEXT:    s_clause 0x1
601; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
602; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
603; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
604; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
606; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
607; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
608; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
609; GFX10-WGP-NEXT:    s_endpgm
610;
611; GFX10-CU-LABEL: global_system_release_store:
612; GFX10-CU:       ; %bb.0: ; %entry
613; GFX10-CU-NEXT:    s_clause 0x1
614; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
615; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
616; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
617; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
618; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
619; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
620; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
621; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
622; GFX10-CU-NEXT:    s_endpgm
623;
624; SKIP-CACHE-INV-LABEL: global_system_release_store:
625; SKIP-CACHE-INV:       ; %bb.0: ; %entry
626; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
627; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
628; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
630; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
631; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
632; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
633; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
634; SKIP-CACHE-INV-NEXT:    s_endpgm
635;
636; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store:
637; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
638; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
639; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
640; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
641; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
642; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
643; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
644; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
645; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
646; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
647;
648; GFX90A-TGSPLIT-LABEL: global_system_release_store:
649; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
650; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
651; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
652; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
653; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
655; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
656; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
657; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
658; GFX90A-TGSPLIT-NEXT:    s_endpgm
659    i32 %in, i32 addrspace(1)* %out) {
660entry:
661  store atomic i32 %in, i32 addrspace(1)* %out release, align 4
662  ret void
663}
664
; Kernel under test: performs a `seq_cst` atomic store of the i32 argument %in
; to %out (addrspace(1), no syncscope given).
; Expected output: `s_waitcnt vmcnt(0) lgkmcnt(0)` ahead of the store on every
; target. The gfx1010 invocations additionally expect
; `s_waitcnt_vscnt null, 0x0` after that wait, and the gfx90a invocations
; expect `buffer_wbl2` before it. No invalidate instruction follows the store.
665define amdgpu_kernel void @global_system_seq_cst_store(
666; GFX6-LABEL: global_system_seq_cst_store:
667; GFX6:       ; %bb.0: ; %entry
668; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
669; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
670; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
671; GFX6-NEXT:    s_mov_b32 s2, -1
672; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX6-NEXT:    v_mov_b32_e32 v0, s6
674; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
675; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
676; GFX6-NEXT:    s_endpgm
677;
678; GFX7-LABEL: global_system_seq_cst_store:
679; GFX7:       ; %bb.0: ; %entry
680; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
681; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
682; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX7-NEXT:    v_mov_b32_e32 v2, s2
684; GFX7-NEXT:    v_mov_b32_e32 v0, s0
685; GFX7-NEXT:    v_mov_b32_e32 v1, s1
686; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
687; GFX7-NEXT:    flat_store_dword v[0:1], v2
688; GFX7-NEXT:    s_endpgm
689;
690; GFX10-WGP-LABEL: global_system_seq_cst_store:
691; GFX10-WGP:       ; %bb.0: ; %entry
692; GFX10-WGP-NEXT:    s_clause 0x1
693; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
694; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
695; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
696; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
698; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
700; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
701; GFX10-WGP-NEXT:    s_endpgm
702;
703; GFX10-CU-LABEL: global_system_seq_cst_store:
704; GFX10-CU:       ; %bb.0: ; %entry
705; GFX10-CU-NEXT:    s_clause 0x1
706; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
707; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
708; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
709; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
711; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
712; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
713; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
714; GFX10-CU-NEXT:    s_endpgm
715;
716; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store:
717; SKIP-CACHE-INV:       ; %bb.0: ; %entry
718; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
719; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
720; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
721; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
722; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
723; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
724; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
725; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
726; SKIP-CACHE-INV-NEXT:    s_endpgm
727;
728; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
729; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
730; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
731; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
732; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
733; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
734; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
735; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
736; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
737; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
738; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
739;
740; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store:
741; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
742; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
743; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
744; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
745; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
746; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
747; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
748; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
749; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
750; GFX90A-TGSPLIT-NEXT:    s_endpgm
751    i32 %in, i32 addrspace(1)* %out) {
752entry:
753  store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
754  ret void
755}
756
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with monotonic
; ordering and no explicit syncscope; the result %val is unused.
; Every run expects the atomic swap to be followed directly by s_endpgm. The
; only wait before it is the lgkmcnt(0) that follows the scalar loads, and no
; cache write-back or invalidate instruction is expected on any target.
757define amdgpu_kernel void @global_system_monotonic_atomicrmw(
758; GFX6-LABEL: global_system_monotonic_atomicrmw:
759; GFX6:       ; %bb.0: ; %entry
760; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
761; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
762; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
763; GFX6-NEXT:    s_mov_b32 s2, -1
764; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX6-NEXT:    v_mov_b32_e32 v0, s4
766; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
767; GFX6-NEXT:    s_endpgm
768;
769; GFX7-LABEL: global_system_monotonic_atomicrmw:
770; GFX7:       ; %bb.0: ; %entry
771; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
772; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
773; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX7-NEXT:    v_mov_b32_e32 v0, s0
775; GFX7-NEXT:    v_mov_b32_e32 v1, s1
776; GFX7-NEXT:    v_mov_b32_e32 v2, s2
777; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
778; GFX7-NEXT:    s_endpgm
779;
780; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw:
781; GFX10-WGP:       ; %bb.0: ; %entry
782; GFX10-WGP-NEXT:    s_clause 0x1
783; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
784; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
785; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
786; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
788; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
789; GFX10-WGP-NEXT:    s_endpgm
790;
791; GFX10-CU-LABEL: global_system_monotonic_atomicrmw:
792; GFX10-CU:       ; %bb.0: ; %entry
793; GFX10-CU-NEXT:    s_clause 0x1
794; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
795; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
796; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
797; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
798; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
799; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
800; GFX10-CU-NEXT:    s_endpgm
801;
802; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw:
803; SKIP-CACHE-INV:       ; %bb.0: ; %entry
804; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
805; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
806; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
807; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
808; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
809; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
810; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
811; SKIP-CACHE-INV-NEXT:    s_endpgm
812;
813; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
814; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
815; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
816; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
817; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
818; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
820; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
821; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
822;
823; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
824; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
825; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
826; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
827; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
828; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
830; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
831; GFX90A-TGSPLIT-NEXT:    s_endpgm
832    i32 addrspace(1)* %out, i32 %in) {
833entry:
834  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic
835  ret void
836}
837
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with acquire
; ordering and no explicit syncscope; the result %val is unused.
; Nothing extra is expected before the swap. After the swap the runs expect:
;   - gfx600: s_waitcnt vmcnt(0), then buffer_wbinvl1
;   - gfx700: s_waitcnt vmcnt(0), then buffer_wbinvl1_vol
;   - gfx1010 (with and without +cumode): s_waitcnt_vscnt null, 0x0, then
;     buffer_gl0_inv and buffer_gl1_inv
;   - the -amdgcn-skip-cache-invalidations run: only s_waitcnt vmcnt(0), with
;     no invalidate instruction
;   - gfx90a (with and without +tgsplit): s_waitcnt vmcnt(0), then
;     buffer_invl2 and buffer_wbinvl1_vol
838define amdgpu_kernel void @global_system_acquire_atomicrmw(
839; GFX6-LABEL: global_system_acquire_atomicrmw:
840; GFX6:       ; %bb.0: ; %entry
841; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
842; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
843; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
844; GFX6-NEXT:    s_mov_b32 s2, -1
845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX6-NEXT:    v_mov_b32_e32 v0, s4
847; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
848; GFX6-NEXT:    s_waitcnt vmcnt(0)
849; GFX6-NEXT:    buffer_wbinvl1
850; GFX6-NEXT:    s_endpgm
851;
852; GFX7-LABEL: global_system_acquire_atomicrmw:
853; GFX7:       ; %bb.0: ; %entry
854; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
855; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
856; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX7-NEXT:    v_mov_b32_e32 v0, s0
858; GFX7-NEXT:    v_mov_b32_e32 v1, s1
859; GFX7-NEXT:    v_mov_b32_e32 v2, s2
860; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
861; GFX7-NEXT:    s_waitcnt vmcnt(0)
862; GFX7-NEXT:    buffer_wbinvl1_vol
863; GFX7-NEXT:    s_endpgm
864;
865; GFX10-WGP-LABEL: global_system_acquire_atomicrmw:
866; GFX10-WGP:       ; %bb.0: ; %entry
867; GFX10-WGP-NEXT:    s_clause 0x1
868; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
869; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
870; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
871; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
873; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
874; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
875; GFX10-WGP-NEXT:    buffer_gl0_inv
876; GFX10-WGP-NEXT:    buffer_gl1_inv
877; GFX10-WGP-NEXT:    s_endpgm
878;
879; GFX10-CU-LABEL: global_system_acquire_atomicrmw:
880; GFX10-CU:       ; %bb.0: ; %entry
881; GFX10-CU-NEXT:    s_clause 0x1
882; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
883; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
884; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
885; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
887; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
888; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
889; GFX10-CU-NEXT:    buffer_gl0_inv
890; GFX10-CU-NEXT:    buffer_gl1_inv
891; GFX10-CU-NEXT:    s_endpgm
892;
893; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw:
894; SKIP-CACHE-INV:       ; %bb.0: ; %entry
895; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
896; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
897; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
898; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
899; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
900; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
901; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
902; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
903; SKIP-CACHE-INV-NEXT:    s_endpgm
904;
905; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
906; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
907; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
908; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
909; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
910; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
911; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
912; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
913; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
914; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
915; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
916; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
917;
918; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
919; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
920; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
921; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
922; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
923; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
925; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
926; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
927; GFX90A-TGSPLIT-NEXT:    buffer_invl2
928; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
929; GFX90A-TGSPLIT-NEXT:    s_endpgm
930    i32 addrspace(1)* %out, i32 %in) {
931entry:
932  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
933  ret void
934}
935
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with release
; ordering and no explicit syncscope; the result %val is unused.
; Every run expects s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the swap.
; The gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0 after that
; wait, and the gfx90a runs expect buffer_wbl2 ahead of it. The swap is
; followed directly by s_endpgm, with no wait or invalidate after it.
936define amdgpu_kernel void @global_system_release_atomicrmw(
937; GFX6-LABEL: global_system_release_atomicrmw:
938; GFX6:       ; %bb.0: ; %entry
939; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
940; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
941; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
942; GFX6-NEXT:    s_mov_b32 s2, -1
943; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX6-NEXT:    v_mov_b32_e32 v0, s4
945; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
946; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
947; GFX6-NEXT:    s_endpgm
948;
949; GFX7-LABEL: global_system_release_atomicrmw:
950; GFX7:       ; %bb.0: ; %entry
951; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
952; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
953; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX7-NEXT:    v_mov_b32_e32 v0, s0
955; GFX7-NEXT:    v_mov_b32_e32 v1, s1
956; GFX7-NEXT:    v_mov_b32_e32 v2, s2
957; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
958; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
959; GFX7-NEXT:    s_endpgm
960;
961; GFX10-WGP-LABEL: global_system_release_atomicrmw:
962; GFX10-WGP:       ; %bb.0: ; %entry
963; GFX10-WGP-NEXT:    s_clause 0x1
964; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
965; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
966; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
967; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
969; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
970; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
971; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
972; GFX10-WGP-NEXT:    s_endpgm
973;
974; GFX10-CU-LABEL: global_system_release_atomicrmw:
975; GFX10-CU:       ; %bb.0: ; %entry
976; GFX10-CU-NEXT:    s_clause 0x1
977; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
978; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
979; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
980; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
981; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
982; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
983; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
984; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
985; GFX10-CU-NEXT:    s_endpgm
986;
987; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw:
988; SKIP-CACHE-INV:       ; %bb.0: ; %entry
989; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
990; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
991; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
992; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
993; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
994; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
995; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
996; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
997; SKIP-CACHE-INV-NEXT:    s_endpgm
998;
999; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
1000; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1001; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1002; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1003; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1004; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1005; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1006; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1007; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1008; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1009; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1010;
1011; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw:
1012; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1013; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1014; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1015; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1016; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1018; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1019; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1020; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1021; GFX90A-TGSPLIT-NEXT:    s_endpgm
1022    i32 addrspace(1)* %out, i32 %in) {
1023entry:
1024  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release
1025  ret void
1026}
1027
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with acq_rel
; ordering and no explicit syncscope; the result %val is unused.
; Before the swap every run expects s_waitcnt vmcnt(0) lgkmcnt(0); the gfx1010
; runs add s_waitcnt_vscnt null, 0x0 after it and the gfx90a runs put
; buffer_wbl2 ahead of it.
; After the swap the runs expect:
;   - gfx600: s_waitcnt vmcnt(0), then buffer_wbinvl1
;   - gfx700: s_waitcnt vmcnt(0), then buffer_wbinvl1_vol
;   - gfx1010 (with and without +cumode): s_waitcnt_vscnt null, 0x0, then
;     buffer_gl0_inv and buffer_gl1_inv
;   - the -amdgcn-skip-cache-invalidations run: only s_waitcnt vmcnt(0), with
;     no invalidate instruction
;   - gfx90a (with and without +tgsplit): s_waitcnt vmcnt(0), then
;     buffer_invl2 and buffer_wbinvl1_vol
1028define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
1029; GFX6-LABEL: global_system_acq_rel_atomicrmw:
1030; GFX6:       ; %bb.0: ; %entry
1031; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1032; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1033; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1034; GFX6-NEXT:    s_mov_b32 s2, -1
1035; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1037; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1038; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1039; GFX6-NEXT:    s_waitcnt vmcnt(0)
1040; GFX6-NEXT:    buffer_wbinvl1
1041; GFX6-NEXT:    s_endpgm
1042;
1043; GFX7-LABEL: global_system_acq_rel_atomicrmw:
1044; GFX7:       ; %bb.0: ; %entry
1045; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1046; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1047; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1049; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1050; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1051; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1052; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1053; GFX7-NEXT:    s_waitcnt vmcnt(0)
1054; GFX7-NEXT:    buffer_wbinvl1_vol
1055; GFX7-NEXT:    s_endpgm
1056;
1057; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw:
1058; GFX10-WGP:       ; %bb.0: ; %entry
1059; GFX10-WGP-NEXT:    s_clause 0x1
1060; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1061; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1062; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1063; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1065; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1066; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1067; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
1068; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1069; GFX10-WGP-NEXT:    buffer_gl0_inv
1070; GFX10-WGP-NEXT:    buffer_gl1_inv
1071; GFX10-WGP-NEXT:    s_endpgm
1072;
1073; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw:
1074; GFX10-CU:       ; %bb.0: ; %entry
1075; GFX10-CU-NEXT:    s_clause 0x1
1076; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1077; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1079; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1080; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1081; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1083; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
1084; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1085; GFX10-CU-NEXT:    buffer_gl0_inv
1086; GFX10-CU-NEXT:    buffer_gl1_inv
1087; GFX10-CU-NEXT:    s_endpgm
1088;
1089; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw:
1090; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1091; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1092; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1093; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1094; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1095; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1097; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1099; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1100; SKIP-CACHE-INV-NEXT:    s_endpgm
1101;
1102; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
1103; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1104; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1105; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1106; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1107; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1109; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1110; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1111; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1113; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1114; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1115; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1116;
1117; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
1118; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1119; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1120; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1121; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1122; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1124; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1125; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1126; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1127; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1128; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1129; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1130; GFX90A-TGSPLIT-NEXT:    s_endpgm
1131    i32 addrspace(1)* %out, i32 %in) {
1132entry:
1133  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
1134  ret void
1135}
1136
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with seq_cst
; ordering and no explicit syncscope; the result %val is unused.
; Before the swap every run expects s_waitcnt vmcnt(0) lgkmcnt(0); the gfx1010
; runs add s_waitcnt_vscnt null, 0x0 after it and the gfx90a runs put
; buffer_wbl2 ahead of it.
; After the swap the runs expect:
;   - gfx600: s_waitcnt vmcnt(0), then buffer_wbinvl1
;   - gfx700: s_waitcnt vmcnt(0), then buffer_wbinvl1_vol
;   - gfx1010 (with and without +cumode): s_waitcnt_vscnt null, 0x0, then
;     buffer_gl0_inv and buffer_gl1_inv
;   - the -amdgcn-skip-cache-invalidations run: only s_waitcnt vmcnt(0), with
;     no invalidate instruction
;   - gfx90a (with and without +tgsplit): s_waitcnt vmcnt(0), then
;     buffer_invl2 and buffer_wbinvl1_vol
1137define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
1138; GFX6-LABEL: global_system_seq_cst_atomicrmw:
1139; GFX6:       ; %bb.0: ; %entry
1140; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1141; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1142; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1143; GFX6-NEXT:    s_mov_b32 s2, -1
1144; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1146; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1147; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1148; GFX6-NEXT:    s_waitcnt vmcnt(0)
1149; GFX6-NEXT:    buffer_wbinvl1
1150; GFX6-NEXT:    s_endpgm
1151;
1152; GFX7-LABEL: global_system_seq_cst_atomicrmw:
1153; GFX7:       ; %bb.0: ; %entry
1154; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1155; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1156; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1158; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1159; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1160; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1161; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1162; GFX7-NEXT:    s_waitcnt vmcnt(0)
1163; GFX7-NEXT:    buffer_wbinvl1_vol
1164; GFX7-NEXT:    s_endpgm
1165;
1166; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw:
1167; GFX10-WGP:       ; %bb.0: ; %entry
1168; GFX10-WGP-NEXT:    s_clause 0x1
1169; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1170; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1171; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1172; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1174; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1175; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1176; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
1177; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1178; GFX10-WGP-NEXT:    buffer_gl0_inv
1179; GFX10-WGP-NEXT:    buffer_gl1_inv
1180; GFX10-WGP-NEXT:    s_endpgm
1181;
1182; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw:
1183; GFX10-CU:       ; %bb.0: ; %entry
1184; GFX10-CU-NEXT:    s_clause 0x1
1185; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1186; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1187; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1188; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1190; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1191; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1192; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
1193; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1194; GFX10-CU-NEXT:    buffer_gl0_inv
1195; GFX10-CU-NEXT:    buffer_gl1_inv
1196; GFX10-CU-NEXT:    s_endpgm
1197;
1198; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw:
1199; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1200; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1201; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1202; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1203; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1204; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1206; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1207; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1208; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1209; SKIP-CACHE-INV-NEXT:    s_endpgm
1210;
1211; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
1212; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1213; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1214; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1215; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1216; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1217; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1218; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1219; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1220; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1221; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1222; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1223; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1224; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1225;
1226; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
1227; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1228; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1229; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1230; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1231; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1232; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1233; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1234; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1235; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1236; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1237; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1238; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1239; GFX90A-TGSPLIT-NEXT:    s_endpgm
1240    i32 addrspace(1)* %out, i32 %in) {
1241entry:
1242  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
1243  ret void
1244}
1245
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with acquire
; ordering and no explicit syncscope, where the exchanged-out value %val is
; then stored back to %out.
; Every run expects the swap to carry the glc modifier and to be followed by
; s_waitcnt vmcnt(0), including the gfx1010 runs. The expected cache
; invalidates come next: buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on
; gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and buffer_invl2 +
; buffer_wbinvl1_vol on gfx90a. The -amdgcn-skip-cache-invalidations run
; expects none. The store of the result is expected last, before s_endpgm.
1246define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
1247; GFX6-LABEL: global_system_acquire_ret_atomicrmw:
1248; GFX6:       ; %bb.0: ; %entry
1249; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1250; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1251; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1252; GFX6-NEXT:    s_mov_b32 s2, -1
1253; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1254; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1255; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1256; GFX6-NEXT:    s_waitcnt vmcnt(0)
1257; GFX6-NEXT:    buffer_wbinvl1
1258; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1259; GFX6-NEXT:    s_endpgm
1260;
1261; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
1262; GFX7:       ; %bb.0: ; %entry
1263; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1264; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1265; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1267; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1268; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1269; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1270; GFX7-NEXT:    s_waitcnt vmcnt(0)
1271; GFX7-NEXT:    buffer_wbinvl1_vol
1272; GFX7-NEXT:    flat_store_dword v[0:1], v2
1273; GFX7-NEXT:    s_endpgm
1274;
1275; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw:
1276; GFX10-WGP:       ; %bb.0: ; %entry
1277; GFX10-WGP-NEXT:    s_clause 0x1
1278; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1279; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1280; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1281; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1282; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1283; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1284; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1285; GFX10-WGP-NEXT:    buffer_gl0_inv
1286; GFX10-WGP-NEXT:    buffer_gl1_inv
1287; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1288; GFX10-WGP-NEXT:    s_endpgm
1289;
1290; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw:
1291; GFX10-CU:       ; %bb.0: ; %entry
1292; GFX10-CU-NEXT:    s_clause 0x1
1293; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1294; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1296; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1298; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1299; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1300; GFX10-CU-NEXT:    buffer_gl0_inv
1301; GFX10-CU-NEXT:    buffer_gl1_inv
1302; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1303; GFX10-CU-NEXT:    s_endpgm
1304;
1305; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw:
1306; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1307; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1308; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1309; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1310; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1311; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1313; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1314; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1315; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1316; SKIP-CACHE-INV-NEXT:    s_endpgm
1317;
1318; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
1319; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1320; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1321; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1322; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1323; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1324; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1325; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1326; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1327; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1328; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1329; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1330; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1331;
1332; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
1333; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1334; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1335; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1336; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1337; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1339; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1340; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1341; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1342; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1343; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1344; GFX90A-TGSPLIT-NEXT:    s_endpgm
1345    i32 addrspace(1)* %out, i32 %in) {
1346entry:
1347  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
  ; Non-atomic store of the exchanged-out value; this is the only use of %val.
1348  store i32 %val, i32 addrspace(1)* %out, align 4
1349  ret void
1350}
1351
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with acq_rel
; ordering and no explicit syncscope, where the exchanged-out value %val is
; then stored back to %out.
; Before the swap every run expects s_waitcnt vmcnt(0) lgkmcnt(0); the gfx1010
; runs add s_waitcnt_vscnt null, 0x0 after it and the gfx90a runs put
; buffer_wbl2 ahead of it.
; Every run expects the swap to carry the glc modifier and to be followed by
; s_waitcnt vmcnt(0). The expected cache invalidates come next: buffer_wbinvl1
; on gfx600, buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on
; gfx1010, and buffer_invl2 + buffer_wbinvl1_vol on gfx90a. The
; -amdgcn-skip-cache-invalidations run expects none. The store of the result
; is expected last, before s_endpgm.
1352define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
1353; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw:
1354; GFX6:       ; %bb.0: ; %entry
1355; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1356; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1357; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1358; GFX6-NEXT:    s_mov_b32 s2, -1
1359; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1361; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1362; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1363; GFX6-NEXT:    s_waitcnt vmcnt(0)
1364; GFX6-NEXT:    buffer_wbinvl1
1365; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1366; GFX6-NEXT:    s_endpgm
1367;
1368; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
1369; GFX7:       ; %bb.0: ; %entry
1370; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1371; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1372; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1374; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1375; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1376; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1377; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1378; GFX7-NEXT:    s_waitcnt vmcnt(0)
1379; GFX7-NEXT:    buffer_wbinvl1_vol
1380; GFX7-NEXT:    flat_store_dword v[0:1], v2
1381; GFX7-NEXT:    s_endpgm
1382;
1383; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
1384; GFX10-WGP:       ; %bb.0: ; %entry
1385; GFX10-WGP-NEXT:    s_clause 0x1
1386; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1387; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1388; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1389; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1391; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1392; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1393; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1394; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1395; GFX10-WGP-NEXT:    buffer_gl0_inv
1396; GFX10-WGP-NEXT:    buffer_gl1_inv
1397; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1398; GFX10-WGP-NEXT:    s_endpgm
1399;
1400; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
1401; GFX10-CU:       ; %bb.0: ; %entry
1402; GFX10-CU-NEXT:    s_clause 0x1
1403; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1404; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1405; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1406; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1408; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1409; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1410; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1411; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1412; GFX10-CU-NEXT:    buffer_gl0_inv
1413; GFX10-CU-NEXT:    buffer_gl1_inv
1414; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1415; GFX10-CU-NEXT:    s_endpgm
1416;
1417; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw:
1418; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1419; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1420; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1421; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1422; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1423; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1424; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1425; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1426; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1427; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1428; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1429; SKIP-CACHE-INV-NEXT:    s_endpgm
1430;
1431; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
1432; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1433; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1434; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1435; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1436; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1438; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1439; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1440; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1441; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1442; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1443; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1444; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1445; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1446;
1447; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
1448; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1449; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1450; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1451; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1452; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1454; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1455; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1456; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1457; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1458; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1459; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1460; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1461; GFX90A-TGSPLIT-NEXT:    s_endpgm
1462    i32 addrspace(1)* %out, i32 %in) {
1463entry:
1464  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
  ; Non-atomic store of the exchanged-out value; this is the only use of %val.
1465  store i32 %val, i32 addrspace(1)* %out, align 4
1466  ret void
1467}
1468
; Volatile seq_cst `atomicrmw xchg` on a global (addrspace(1)) pointer with no
; syncscope operand; the value returned by the atomic is stored back through %out.
; Expected code, as pinned by the checks below: outstanding memory operations
; are waited on before the atomic (s_waitcnt vmcnt(0) lgkmcnt(0), plus
; s_waitcnt_vscnt on the two gfx1010 prefixes and a preceding buffer_wbl2 on the
; two gfx90a prefixes), the atomic is emitted in its returning (glc) form, and
; s_waitcnt vmcnt(0) follows it. Cache invalidates come next on every prefix
; except the run that passes -amdgcn-skip-cache-invalidations, which goes
; straight to the store.
1469define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
1470; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw:
1471; GFX6:       ; %bb.0: ; %entry
1472; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1473; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
1474; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1475; GFX6-NEXT:    s_mov_b32 s2, -1
1476; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1478; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1479; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
1480; GFX6-NEXT:    s_waitcnt vmcnt(0)
1481; GFX6-NEXT:    buffer_wbinvl1
1482; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1483; GFX6-NEXT:    s_endpgm
1484;
1485; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
1486; GFX7:       ; %bb.0: ; %entry
1487; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1488; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1489; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1491; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1492; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1493; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1494; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1495; GFX7-NEXT:    s_waitcnt vmcnt(0)
1496; GFX7-NEXT:    buffer_wbinvl1_vol
1497; GFX7-NEXT:    flat_store_dword v[0:1], v2
1498; GFX7-NEXT:    s_endpgm
1499;
1500; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
1501; GFX10-WGP:       ; %bb.0: ; %entry
1502; GFX10-WGP-NEXT:    s_clause 0x1
1503; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1504; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1505; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1506; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1508; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1509; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1510; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1511; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
1512; GFX10-WGP-NEXT:    buffer_gl0_inv
1513; GFX10-WGP-NEXT:    buffer_gl1_inv
1514; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1515; GFX10-WGP-NEXT:    s_endpgm
1516;
1517; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
1518; GFX10-CU:       ; %bb.0: ; %entry
1519; GFX10-CU-NEXT:    s_clause 0x1
1520; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1521; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1522; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1523; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1525; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1526; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1527; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1528; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
1529; GFX10-CU-NEXT:    buffer_gl0_inv
1530; GFX10-CU-NEXT:    buffer_gl1_inv
1531; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1532; GFX10-CU-NEXT:    s_endpgm
1533;
1534; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw:
1535; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1536; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1537; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1539; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1540; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1541; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1542; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1543; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1544; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1545; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1546; SKIP-CACHE-INV-NEXT:    s_endpgm
1547;
1548; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
1549; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1550; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1552; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1553; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1555; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1556; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1557; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1558; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1559; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1560; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1561; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1562; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1563;
1564; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
1565; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1566; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1567; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1568; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1569; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1570; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1571; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1572; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1573; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1574; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1575; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1576; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1577; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
1578; GFX90A-TGSPLIT-NEXT:    s_endpgm
1579    i32 addrspace(1)* %out, i32 %in) {
1580entry:
  ; The exchanged-out value is consumed by the store below, so the atomic's
  ; result must be returned to a register.
1581  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
1582  store i32 %val, i32 addrspace(1)* %out, align 4
1583  ret void
1584}
1585
; Volatile `cmpxchg` with monotonic success and failure orderings on element 4
; of a global (addrspace(1)) pointer; no syncscope operand, result unused.
; As pinned by the checks below, the only wait on any prefix is the
; s_waitcnt lgkmcnt(0) for the kernel-argument loads: nothing is inserted
; before or after the atomic itself, and it is emitted without glc.
1586define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
1587; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg:
1588; GFX6:       ; %bb.0: ; %entry
1589; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1590; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1591; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1592; GFX6-NEXT:    s_mov_b32 s2, -1
1593; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1595; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1596; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1597; GFX6-NEXT:    s_endpgm
1598;
1599; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
1600; GFX7:       ; %bb.0: ; %entry
1601; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1602; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1603; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1604; GFX7-NEXT:    s_add_u32 s0, s0, 16
1605; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1606; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1607; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1608; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1609; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1610; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1611; GFX7-NEXT:    s_endpgm
1612;
1613; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
1614; GFX10-WGP:       ; %bb.0: ; %entry
1615; GFX10-WGP-NEXT:    s_clause 0x1
1616; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1617; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1618; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1619; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1620; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1621; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1622; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1623; GFX10-WGP-NEXT:    s_endpgm
1624;
1625; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
1626; GFX10-CU:       ; %bb.0: ; %entry
1627; GFX10-CU-NEXT:    s_clause 0x1
1628; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1629; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1630; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1631; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1633; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1634; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1635; GFX10-CU-NEXT:    s_endpgm
1636;
1637; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg:
1638; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1639; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1640; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1641; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1642; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1643; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1646; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1647; SKIP-CACHE-INV-NEXT:    s_endpgm
1648;
1649; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
1650; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1651; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1652; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1653; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1654; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1656; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1657; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1658;
1659; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
1660; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1661; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1662; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1663; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1664; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1666; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1667; GFX90A-TGSPLIT-NEXT:    s_endpgm
1668    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1669entry:
  ; i32 index 4 is a 16-byte offset; it appears as offset:16 in the checks
  ; above, or as the s_add_u32/s_addc_u32 pair in the gfx700 flat sequence.
1670  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1671  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic monotonic
1672  ret void
1673}
1674
; Volatile `cmpxchg` with acquire success / monotonic failure ordering on
; element 4 of a global (addrspace(1)) pointer; no syncscope operand, result
; unused.
; As pinned by the checks below, nothing is inserted before the atomic. After
; it, each prefix waits for the atomic to complete (s_waitcnt vmcnt(0), or
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 prefixes) and then invalidates
; caches: buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on gfx700,
; buffer_gl0_inv + buffer_gl1_inv on gfx1010, buffer_invl2 + buffer_wbinvl1_vol
; on gfx90a. The run with -amdgcn-skip-cache-invalidations keeps the wait but
; has no invalidate.
1675define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
1676; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg:
1677; GFX6:       ; %bb.0: ; %entry
1678; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1679; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1680; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1681; GFX6-NEXT:    s_mov_b32 s2, -1
1682; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1684; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1685; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1686; GFX6-NEXT:    s_waitcnt vmcnt(0)
1687; GFX6-NEXT:    buffer_wbinvl1
1688; GFX6-NEXT:    s_endpgm
1689;
1690; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
1691; GFX7:       ; %bb.0: ; %entry
1692; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1693; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1694; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX7-NEXT:    s_add_u32 s0, s0, 16
1696; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1697; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1698; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1699; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1700; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1701; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1702; GFX7-NEXT:    s_waitcnt vmcnt(0)
1703; GFX7-NEXT:    buffer_wbinvl1_vol
1704; GFX7-NEXT:    s_endpgm
1705;
1706; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
1707; GFX10-WGP:       ; %bb.0: ; %entry
1708; GFX10-WGP-NEXT:    s_clause 0x1
1709; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1710; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1711; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1712; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1713; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1714; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1715; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1716; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1717; GFX10-WGP-NEXT:    buffer_gl0_inv
1718; GFX10-WGP-NEXT:    buffer_gl1_inv
1719; GFX10-WGP-NEXT:    s_endpgm
1720;
1721; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
1722; GFX10-CU:       ; %bb.0: ; %entry
1723; GFX10-CU-NEXT:    s_clause 0x1
1724; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1725; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1726; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1727; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1729; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1730; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1731; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1732; GFX10-CU-NEXT:    buffer_gl0_inv
1733; GFX10-CU-NEXT:    buffer_gl1_inv
1734; GFX10-CU-NEXT:    s_endpgm
1735;
1736; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg:
1737; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1738; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1739; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1740; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1741; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1742; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1743; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1744; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1745; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1746; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1747; SKIP-CACHE-INV-NEXT:    s_endpgm
1748;
1749; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
1750; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1751; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1752; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1753; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1754; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1756; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1757; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1758; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1759; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1760; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1761;
1762; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
1763; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1764; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1765; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1766; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1767; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1769; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1770; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1771; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1772; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1773; GFX90A-TGSPLIT-NEXT:    s_endpgm
1774    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1775entry:
  ; i32 index 4 is a 16-byte offset; it appears as offset:16 in the checks
  ; above, or as the s_add_u32/s_addc_u32 pair in the gfx700 flat sequence.
1776  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1777  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic
1778  ret void
1779}
1780
; Volatile `cmpxchg` with release success / monotonic failure ordering on
; element 4 of a global (addrspace(1)) pointer; no syncscope operand, result
; unused.
; As pinned by the checks below, every prefix waits on outstanding memory
; operations immediately before the atomic (s_waitcnt vmcnt(0) lgkmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 prefixes, and with buffer_wbl2
; issued ahead of the wait on the two gfx90a prefixes). Nothing is inserted
; after the atomic: no wait and no cache invalidate.
1781define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
1782; GFX6-LABEL: global_system_release_monotonic_cmpxchg:
1783; GFX6:       ; %bb.0: ; %entry
1784; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1785; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1786; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1787; GFX6-NEXT:    s_mov_b32 s2, -1
1788; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1790; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1791; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1792; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1793; GFX6-NEXT:    s_endpgm
1794;
1795; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
1796; GFX7:       ; %bb.0: ; %entry
1797; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1798; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1799; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1800; GFX7-NEXT:    s_add_u32 s0, s0, 16
1801; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1802; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1803; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1804; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1805; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1806; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1807; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1808; GFX7-NEXT:    s_endpgm
1809;
1810; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg:
1811; GFX10-WGP:       ; %bb.0: ; %entry
1812; GFX10-WGP-NEXT:    s_clause 0x1
1813; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1814; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1815; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1816; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1817; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1818; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1819; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1820; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1821; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1822; GFX10-WGP-NEXT:    s_endpgm
1823;
1824; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg:
1825; GFX10-CU:       ; %bb.0: ; %entry
1826; GFX10-CU-NEXT:    s_clause 0x1
1827; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1828; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1829; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1830; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1831; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1832; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1833; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1834; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1835; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1836; GFX10-CU-NEXT:    s_endpgm
1837;
1838; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg:
1839; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1840; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1841; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1842; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1843; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1844; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1845; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1846; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1847; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1848; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1849; SKIP-CACHE-INV-NEXT:    s_endpgm
1850;
1851; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
1852; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1853; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1854; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1855; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1856; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1857; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1858; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1859; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1860; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1861; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1862;
1863; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
1864; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1865; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1866; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1867; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1868; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1869; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1870; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1871; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1872; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1873; GFX90A-TGSPLIT-NEXT:    s_endpgm
1874    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1875entry:
  ; i32 index 4 is a 16-byte offset; it appears as offset:16 in the checks
  ; above, or as the s_add_u32/s_addc_u32 pair in the gfx700 flat sequence.
1876  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1877  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release monotonic
1878  ret void
1879}
1880
; Volatile `cmpxchg` with acq_rel success / monotonic failure ordering on
; element 4 of a global (addrspace(1)) pointer; no syncscope operand, result
; unused.
; As pinned by the checks below, the atomic is bracketed on both sides.
; Before: s_waitcnt vmcnt(0) lgkmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on the
; two gfx1010 prefixes, and buffer_wbl2 ahead of the wait on the two gfx90a
; prefixes). After: a wait for the atomic (s_waitcnt vmcnt(0), or
; s_waitcnt_vscnt null, 0x0 on gfx1010) followed by the per-target cache
; invalidates. The run with -amdgcn-skip-cache-invalidations keeps both waits
; but has no invalidate.
1881define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
1882; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1883; GFX6:       ; %bb.0: ; %entry
1884; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1885; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1886; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
1887; GFX6-NEXT:    s_mov_b32 s2, -1
1888; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1890; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1891; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1892; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
1893; GFX6-NEXT:    s_waitcnt vmcnt(0)
1894; GFX6-NEXT:    buffer_wbinvl1
1895; GFX6-NEXT:    s_endpgm
1896;
1897; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1898; GFX7:       ; %bb.0: ; %entry
1899; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1900; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1901; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1902; GFX7-NEXT:    s_add_u32 s0, s0, 16
1903; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1904; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1905; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1906; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1907; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1908; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1909; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1910; GFX7-NEXT:    s_waitcnt vmcnt(0)
1911; GFX7-NEXT:    buffer_wbinvl1_vol
1912; GFX7-NEXT:    s_endpgm
1913;
1914; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1915; GFX10-WGP:       ; %bb.0: ; %entry
1916; GFX10-WGP-NEXT:    s_clause 0x1
1917; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1918; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1919; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1920; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1922; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1923; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1924; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1925; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1926; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1927; GFX10-WGP-NEXT:    buffer_gl0_inv
1928; GFX10-WGP-NEXT:    buffer_gl1_inv
1929; GFX10-WGP-NEXT:    s_endpgm
1930;
1931; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1932; GFX10-CU:       ; %bb.0: ; %entry
1933; GFX10-CU-NEXT:    s_clause 0x1
1934; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1935; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1936; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1937; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1939; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1940; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1941; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1942; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1943; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1944; GFX10-CU-NEXT:    buffer_gl0_inv
1945; GFX10-CU-NEXT:    buffer_gl1_inv
1946; GFX10-CU-NEXT:    s_endpgm
1947;
1948; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1949; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1950; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1951; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1952; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1953; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1954; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1955; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1956; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1957; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1958; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1959; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
1960; SKIP-CACHE-INV-NEXT:    s_endpgm
1961;
1962; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1963; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1964; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1965; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1966; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1967; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1969; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1970; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1971; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1972; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1973; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1974; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1975; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1976;
1977; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1978; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1979; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1980; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1981; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
1982; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1983; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1984; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1985; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1986; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
1987; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1988; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1989; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1990; GFX90A-TGSPLIT-NEXT:    s_endpgm
1991    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1992entry:
  ; i32 index 4 is a 16-byte offset; it appears as offset:16 in the checks
  ; above, or as the s_add_u32/s_addc_u32 pair in the gfx700 flat sequence.
1993  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1994  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic
1995  ret void
1996}
1997
; Volatile `cmpxchg` with seq_cst success / monotonic failure ordering on
; element 4 of a global (addrspace(1)) pointer; no syncscope operand, result
; unused.
; As pinned by the checks below, the atomic is bracketed on both sides.
; Before: s_waitcnt vmcnt(0) lgkmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on the
; two gfx1010 prefixes, and buffer_wbl2 ahead of the wait on the two gfx90a
; prefixes). After: a wait for the atomic (s_waitcnt vmcnt(0), or
; s_waitcnt_vscnt null, 0x0 on gfx1010) followed by the per-target cache
; invalidates. The run with -amdgcn-skip-cache-invalidations keeps both waits
; but has no invalidate.
1998define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
1999; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2000; GFX6:       ; %bb.0: ; %entry
2001; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2002; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2003; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2004; GFX6-NEXT:    s_mov_b32 s2, -1
2005; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2006; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2007; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2008; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2009; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2010; GFX6-NEXT:    s_waitcnt vmcnt(0)
2011; GFX6-NEXT:    buffer_wbinvl1
2012; GFX6-NEXT:    s_endpgm
2013;
2014; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2015; GFX7:       ; %bb.0: ; %entry
2016; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2017; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2018; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX7-NEXT:    s_add_u32 s0, s0, 16
2020; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2021; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2022; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2023; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2024; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2025; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2026; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2027; GFX7-NEXT:    s_waitcnt vmcnt(0)
2028; GFX7-NEXT:    buffer_wbinvl1_vol
2029; GFX7-NEXT:    s_endpgm
2030;
2031; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2032; GFX10-WGP:       ; %bb.0: ; %entry
2033; GFX10-WGP-NEXT:    s_clause 0x1
2034; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2035; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2036; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2037; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2038; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2039; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2040; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2041; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2042; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2043; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2044; GFX10-WGP-NEXT:    buffer_gl0_inv
2045; GFX10-WGP-NEXT:    buffer_gl1_inv
2046; GFX10-WGP-NEXT:    s_endpgm
2047;
2048; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2049; GFX10-CU:       ; %bb.0: ; %entry
2050; GFX10-CU-NEXT:    s_clause 0x1
2051; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2052; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2053; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2054; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2055; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2056; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2057; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2058; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2059; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2060; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2061; GFX10-CU-NEXT:    buffer_gl0_inv
2062; GFX10-CU-NEXT:    buffer_gl1_inv
2063; GFX10-CU-NEXT:    s_endpgm
2064;
2065; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2066; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2067; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2068; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2069; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2070; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2071; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2072; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2073; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2074; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2075; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2076; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2077; SKIP-CACHE-INV-NEXT:    s_endpgm
2078;
2079; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2080; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2081; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2082; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2083; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2084; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2086; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2087; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2088; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2089; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2090; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2091; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2092; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2093;
2094; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
2095; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2096; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2097; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2098; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2099; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2100; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2101; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2102; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2103; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2104; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2105; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2106; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2107; GFX90A-TGSPLIT-NEXT:    s_endpgm
2108    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2109entry:
  ; i32 index 4 is a 16-byte offset; it appears as offset:16 in the checks
  ; above, or as the s_add_u32/s_addc_u32 pair in the gfx700 flat sequence.
2110  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2111  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic
2112  ret void
2113}
2114
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `acquire` and failure ordering `acquire`. No syncscope is
; written on the instruction, and the cmpxchg result is not used.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this ordering pair:
;  * no extra wait is emitted immediately before the atomic;
;  * the atomic is followed by a wait (s_waitcnt vmcnt(0), or
;    s_waitcnt_vscnt null, 0x0 on gfx1010) and then by the target's
;    buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv + buffer_gl1_inv /
;    buffer_invl2 instructions;
;  * the run using -amdgcn-skip-cache-invalidations keeps the wait but has none
;    of those buffer_* instructions after it;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2115define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
2116; GFX6-LABEL: global_system_acquire_acquire_cmpxchg:
2117; GFX6:       ; %bb.0: ; %entry
2118; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2119; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2120; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2121; GFX6-NEXT:    s_mov_b32 s2, -1
2122; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2123; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2124; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2125; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2126; GFX6-NEXT:    s_waitcnt vmcnt(0)
2127; GFX6-NEXT:    buffer_wbinvl1
2128; GFX6-NEXT:    s_endpgm
2129;
2130; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
2131; GFX7:       ; %bb.0: ; %entry
2132; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2133; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2134; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2135; GFX7-NEXT:    s_add_u32 s0, s0, 16
2136; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2137; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2138; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2139; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2140; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2141; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2142; GFX7-NEXT:    s_waitcnt vmcnt(0)
2143; GFX7-NEXT:    buffer_wbinvl1_vol
2144; GFX7-NEXT:    s_endpgm
2145;
2146; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
2147; GFX10-WGP:       ; %bb.0: ; %entry
2148; GFX10-WGP-NEXT:    s_clause 0x1
2149; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2150; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2151; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2152; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2153; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2154; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2155; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2156; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2157; GFX10-WGP-NEXT:    buffer_gl0_inv
2158; GFX10-WGP-NEXT:    buffer_gl1_inv
2159; GFX10-WGP-NEXT:    s_endpgm
2160;
2161; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
2162; GFX10-CU:       ; %bb.0: ; %entry
2163; GFX10-CU-NEXT:    s_clause 0x1
2164; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2165; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2166; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2167; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2169; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2170; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2171; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2172; GFX10-CU-NEXT:    buffer_gl0_inv
2173; GFX10-CU-NEXT:    buffer_gl1_inv
2174; GFX10-CU-NEXT:    s_endpgm
2175;
2176; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg:
2177; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2178; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2179; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2180; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2181; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2182; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2184; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2185; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2186; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2187; SKIP-CACHE-INV-NEXT:    s_endpgm
2188;
2189; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
2190; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2191; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2192; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2193; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2194; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2195; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2196; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2197; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2198; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2199; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2200; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2201;
2202; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
2203; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2204; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2205; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2206; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2207; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2209; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2210; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2211; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2212; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2213; GFX90A-TGSPLIT-NEXT:    s_endpgm
2214    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2215entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2216  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2217  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire
2218  ret void
2219}
2220
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `release` and failure ordering `acquire`. No syncscope is
; written on the instruction, and the cmpxchg result is not used.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this ordering pair:
;  * an `s_waitcnt vmcnt(0) lgkmcnt(0)` sits directly before the atomic
;    (gfx1010 additionally has `s_waitcnt_vscnt null, 0x0`, and gfx90a places a
;    `buffer_wbl2` ahead of the wait);
;  * the atomic is followed by a wait and then by the target's
;    buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv + buffer_gl1_inv /
;    buffer_invl2 instructions;
;  * the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;    none of those buffer_* instructions after the atomic;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2221define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
2222; GFX6-LABEL: global_system_release_acquire_cmpxchg:
2223; GFX6:       ; %bb.0: ; %entry
2224; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2225; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2226; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2227; GFX6-NEXT:    s_mov_b32 s2, -1
2228; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2229; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2230; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2231; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2232; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2233; GFX6-NEXT:    s_waitcnt vmcnt(0)
2234; GFX6-NEXT:    buffer_wbinvl1
2235; GFX6-NEXT:    s_endpgm
2236;
2237; GFX7-LABEL: global_system_release_acquire_cmpxchg:
2238; GFX7:       ; %bb.0: ; %entry
2239; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2240; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2241; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2242; GFX7-NEXT:    s_add_u32 s0, s0, 16
2243; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2244; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2245; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2246; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2247; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2248; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2249; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2250; GFX7-NEXT:    s_waitcnt vmcnt(0)
2251; GFX7-NEXT:    buffer_wbinvl1_vol
2252; GFX7-NEXT:    s_endpgm
2253;
2254; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg:
2255; GFX10-WGP:       ; %bb.0: ; %entry
2256; GFX10-WGP-NEXT:    s_clause 0x1
2257; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2258; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2259; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2260; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2262; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2263; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2264; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2265; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2266; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2267; GFX10-WGP-NEXT:    buffer_gl0_inv
2268; GFX10-WGP-NEXT:    buffer_gl1_inv
2269; GFX10-WGP-NEXT:    s_endpgm
2270;
2271; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
2272; GFX10-CU:       ; %bb.0: ; %entry
2273; GFX10-CU-NEXT:    s_clause 0x1
2274; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2275; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2276; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2277; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2278; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2279; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2280; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2281; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2282; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2283; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2284; GFX10-CU-NEXT:    buffer_gl0_inv
2285; GFX10-CU-NEXT:    buffer_gl1_inv
2286; GFX10-CU-NEXT:    s_endpgm
2287;
2288; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg:
2289; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2290; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2291; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2292; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2293; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2294; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2295; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2297; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2298; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2299; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2300; SKIP-CACHE-INV-NEXT:    s_endpgm
2301;
2302; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
2303; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2304; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2305; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2306; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2307; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2308; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2309; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2310; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2311; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2312; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2313; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2314; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2315; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2316;
2317; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
2318; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2319; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2320; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2321; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2322; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2323; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2324; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2325; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2326; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2327; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2328; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2329; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2330; GFX90A-TGSPLIT-NEXT:    s_endpgm
2331    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2332entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2333  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2334  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire
2335  ret void
2336}
2337
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `acq_rel` and failure ordering `acquire`. No syncscope is
; written on the instruction, and the cmpxchg result is not used.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this ordering pair:
;  * an `s_waitcnt vmcnt(0) lgkmcnt(0)` sits directly before the atomic
;    (gfx1010 additionally has `s_waitcnt_vscnt null, 0x0`, and gfx90a places a
;    `buffer_wbl2` ahead of the wait);
;  * the atomic is followed by a wait and then by the target's
;    buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv + buffer_gl1_inv /
;    buffer_invl2 instructions;
;  * the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;    none of those buffer_* instructions after the atomic;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2338define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
2339; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg:
2340; GFX6:       ; %bb.0: ; %entry
2341; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2342; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2343; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2344; GFX6-NEXT:    s_mov_b32 s2, -1
2345; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2347; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2348; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2349; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2350; GFX6-NEXT:    s_waitcnt vmcnt(0)
2351; GFX6-NEXT:    buffer_wbinvl1
2352; GFX6-NEXT:    s_endpgm
2353;
2354; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
2355; GFX7:       ; %bb.0: ; %entry
2356; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2357; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX7-NEXT:    s_add_u32 s0, s0, 16
2360; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2361; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2362; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2363; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2364; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2365; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2366; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2367; GFX7-NEXT:    s_waitcnt vmcnt(0)
2368; GFX7-NEXT:    buffer_wbinvl1_vol
2369; GFX7-NEXT:    s_endpgm
2370;
2371; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
2372; GFX10-WGP:       ; %bb.0: ; %entry
2373; GFX10-WGP-NEXT:    s_clause 0x1
2374; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2375; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2376; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2377; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2378; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2379; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2380; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2381; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2382; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2383; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2384; GFX10-WGP-NEXT:    buffer_gl0_inv
2385; GFX10-WGP-NEXT:    buffer_gl1_inv
2386; GFX10-WGP-NEXT:    s_endpgm
2387;
2388; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
2389; GFX10-CU:       ; %bb.0: ; %entry
2390; GFX10-CU-NEXT:    s_clause 0x1
2391; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2392; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2393; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2394; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2395; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2396; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2397; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2398; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2399; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2400; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2401; GFX10-CU-NEXT:    buffer_gl0_inv
2402; GFX10-CU-NEXT:    buffer_gl1_inv
2403; GFX10-CU-NEXT:    s_endpgm
2404;
2405; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg:
2406; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2407; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2408; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2410; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2411; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2412; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2413; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2414; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2415; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2416; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2417; SKIP-CACHE-INV-NEXT:    s_endpgm
2418;
2419; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
2420; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2421; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2422; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2423; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2424; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2425; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2426; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2427; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2428; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2429; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2430; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2431; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2432; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2433;
2434; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
2435; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2436; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2437; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2438; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2439; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2440; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2441; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2442; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2443; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2444; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2445; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2446; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2447; GFX90A-TGSPLIT-NEXT:    s_endpgm
2448    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2449entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2450  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2451  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire
2452  ret void
2453}
2454
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `seq_cst` and failure ordering `acquire`. No syncscope is
; written on the instruction, and the cmpxchg result is not used.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this ordering pair:
;  * an `s_waitcnt vmcnt(0) lgkmcnt(0)` sits directly before the atomic
;    (gfx1010 additionally has `s_waitcnt_vscnt null, 0x0`, and gfx90a places a
;    `buffer_wbl2` ahead of the wait);
;  * the atomic is followed by a wait and then by the target's
;    buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv + buffer_gl1_inv /
;    buffer_invl2 instructions;
;  * the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;    none of those buffer_* instructions after the atomic;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2455define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
2456; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg:
2457; GFX6:       ; %bb.0: ; %entry
2458; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2459; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2460; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2461; GFX6-NEXT:    s_mov_b32 s2, -1
2462; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2464; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2465; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2466; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2467; GFX6-NEXT:    s_waitcnt vmcnt(0)
2468; GFX6-NEXT:    buffer_wbinvl1
2469; GFX6-NEXT:    s_endpgm
2470;
2471; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
2472; GFX7:       ; %bb.0: ; %entry
2473; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2474; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2475; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2476; GFX7-NEXT:    s_add_u32 s0, s0, 16
2477; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2478; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2479; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2480; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2481; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2482; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2483; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2484; GFX7-NEXT:    s_waitcnt vmcnt(0)
2485; GFX7-NEXT:    buffer_wbinvl1_vol
2486; GFX7-NEXT:    s_endpgm
2487;
2488; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
2489; GFX10-WGP:       ; %bb.0: ; %entry
2490; GFX10-WGP-NEXT:    s_clause 0x1
2491; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2492; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2493; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2494; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2495; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2496; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2497; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2498; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2499; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2500; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2501; GFX10-WGP-NEXT:    buffer_gl0_inv
2502; GFX10-WGP-NEXT:    buffer_gl1_inv
2503; GFX10-WGP-NEXT:    s_endpgm
2504;
2505; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
2506; GFX10-CU:       ; %bb.0: ; %entry
2507; GFX10-CU-NEXT:    s_clause 0x1
2508; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2509; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2510; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2511; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2513; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2514; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2515; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2516; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2517; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2518; GFX10-CU-NEXT:    buffer_gl0_inv
2519; GFX10-CU-NEXT:    buffer_gl1_inv
2520; GFX10-CU-NEXT:    s_endpgm
2521;
2522; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg:
2523; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2524; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2525; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2526; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2527; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2528; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2529; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2530; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2531; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2532; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2533; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2534; SKIP-CACHE-INV-NEXT:    s_endpgm
2535;
2536; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
2537; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2538; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2539; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2540; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2541; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2543; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2544; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2545; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2546; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2547; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2548; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2549; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2550;
2551; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
2552; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2553; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2554; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2555; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2556; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2557; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2558; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2559; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2560; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2561; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2562; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2563; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2564; GFX90A-TGSPLIT-NEXT:    s_endpgm
2565    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2566entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2567  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2568  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire
2569  ret void
2570}
2571
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `seq_cst` and failure ordering `seq_cst`. No syncscope is
; written on the instruction, and the cmpxchg result is not used.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this ordering pair:
;  * an `s_waitcnt vmcnt(0) lgkmcnt(0)` sits directly before the atomic
;    (gfx1010 additionally has `s_waitcnt_vscnt null, 0x0`, and gfx90a places a
;    `buffer_wbl2` ahead of the wait);
;  * the atomic is followed by a wait and then by the target's
;    buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv + buffer_gl1_inv /
;    buffer_invl2 instructions;
;  * the run using -amdgcn-skip-cache-invalidations keeps both waits but has
;    none of those buffer_* instructions after the atomic;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2572define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
2573; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2574; GFX6:       ; %bb.0: ; %entry
2575; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2576; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2577; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2578; GFX6-NEXT:    s_mov_b32 s2, -1
2579; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2580; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2581; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2582; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2583; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2584; GFX6-NEXT:    s_waitcnt vmcnt(0)
2585; GFX6-NEXT:    buffer_wbinvl1
2586; GFX6-NEXT:    s_endpgm
2587;
2588; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2589; GFX7:       ; %bb.0: ; %entry
2590; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2591; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2592; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2593; GFX7-NEXT:    s_add_u32 s0, s0, 16
2594; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2595; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2596; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2597; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2598; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2599; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2600; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2601; GFX7-NEXT:    s_waitcnt vmcnt(0)
2602; GFX7-NEXT:    buffer_wbinvl1_vol
2603; GFX7-NEXT:    s_endpgm
2604;
2605; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2606; GFX10-WGP:       ; %bb.0: ; %entry
2607; GFX10-WGP-NEXT:    s_clause 0x1
2608; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2609; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2610; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2611; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2612; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2613; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2614; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2615; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2616; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2617; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2618; GFX10-WGP-NEXT:    buffer_gl0_inv
2619; GFX10-WGP-NEXT:    buffer_gl1_inv
2620; GFX10-WGP-NEXT:    s_endpgm
2621;
2622; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2623; GFX10-CU:       ; %bb.0: ; %entry
2624; GFX10-CU-NEXT:    s_clause 0x1
2625; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2626; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2627; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2628; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2629; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2630; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2631; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2632; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2633; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2634; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2635; GFX10-CU-NEXT:    buffer_gl0_inv
2636; GFX10-CU-NEXT:    buffer_gl1_inv
2637; GFX10-CU-NEXT:    s_endpgm
2638;
2639; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2640; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2641; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2642; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2643; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2644; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2645; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2648; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2649; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2650; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2651; SKIP-CACHE-INV-NEXT:    s_endpgm
2652;
2653; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2654; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2655; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2657; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2658; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2660; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2661; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2662; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2663; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2664; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2665; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2666; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2667;
2668; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2669; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2670; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2671; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2672; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2673; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2675; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2676; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2677; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
2678; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2679; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2680; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2681; GFX90A-TGSPLIT-NEXT:    s_endpgm
2682    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2683entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2684  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2685  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
2686  ret void
2687}
2688
; Test kernel: volatile i32 cmpxchg on global memory (addrspace(1)) with
; success ordering `acquire` and failure ordering `monotonic`. No syncscope is
; written on the instruction. Unlike a no-return cmpxchg kernel, this "_ret"
; one extracts the loaded value (field 0 of the result) and stores it to %out.
;
; The expectation lines inside the parameter list are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expectations pin down for this kernel:
;  * the atomic carries the `glc` modifier on every target;
;  * no extra wait is emitted immediately before the atomic;
;  * the atomic is followed by `s_waitcnt vmcnt(0)` (also on gfx1010), then the
;    target's buffer_wbinvl1 / buffer_wbinvl1_vol / buffer_gl0_inv +
;    buffer_gl1_inv / buffer_invl2 instructions, then the store of the result;
;  * the run using -amdgcn-skip-cache-invalidations goes from the wait straight
;    to the store, with none of those buffer_* instructions in between;
;  * both gfx1010 runs expect the same sequence, as do both gfx90a runs.
2689define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
2690; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2691; GFX6:       ; %bb.0: ; %entry
2692; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2693; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2694; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2695; GFX6-NEXT:    s_mov_b32 s2, -1
2696; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2698; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2699; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2700; GFX6-NEXT:    s_waitcnt vmcnt(0)
2701; GFX6-NEXT:    buffer_wbinvl1
2702; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2703; GFX6-NEXT:    s_endpgm
2704;
2705; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2706; GFX7:       ; %bb.0: ; %entry
2707; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2708; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2709; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2710; GFX7-NEXT:    s_add_u32 s4, s0, 16
2711; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2712; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2713; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2714; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2715; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2716; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2717; GFX7-NEXT:    s_waitcnt vmcnt(0)
2718; GFX7-NEXT:    buffer_wbinvl1_vol
2719; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2720; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2721; GFX7-NEXT:    flat_store_dword v[0:1], v2
2722; GFX7-NEXT:    s_endpgm
2723;
2724; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2725; GFX10-WGP:       ; %bb.0: ; %entry
2726; GFX10-WGP-NEXT:    s_clause 0x1
2727; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2728; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2729; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2730; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2731; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2732; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2733; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2734; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2735; GFX10-WGP-NEXT:    buffer_gl0_inv
2736; GFX10-WGP-NEXT:    buffer_gl1_inv
2737; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2738; GFX10-WGP-NEXT:    s_endpgm
2739;
2740; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2741; GFX10-CU:       ; %bb.0: ; %entry
2742; GFX10-CU-NEXT:    s_clause 0x1
2743; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2744; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2745; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2746; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2747; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2748; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2749; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2750; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2751; GFX10-CU-NEXT:    buffer_gl0_inv
2752; GFX10-CU-NEXT:    buffer_gl1_inv
2753; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2754; GFX10-CU-NEXT:    s_endpgm
2755;
2756; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2757; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2758; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2759; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2760; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2761; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2762; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2764; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2765; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2766; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2767; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2768; SKIP-CACHE-INV-NEXT:    s_endpgm
2769;
2770; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2771; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2772; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2774; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2775; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2776; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2777; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2778; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2779; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2780; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2781; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2782; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2783;
2784; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2785; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2786; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2787; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2788; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2789; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2790; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2791; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2792; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2793; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2794; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2795; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2796; GFX90A-TGSPLIT-NEXT:    s_endpgm
2797    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2798entry:
  ; Element 4 of %out, i.e. 16 bytes past %out; this is the `offset:16` (or the
  ; `s_add_u32 ..., 16` on gfx700) seen in the expectations above.
2799  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
2800  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic
  ; Field 0 of the { i32, i1 } result is the value that was read from memory;
  ; the i1 success flag (field 1) is not used.
2801  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the value read back to the start of %out so the atomic's result is
  ; live (this is what makes the atomic appear with `glc` above).
2802  store i32 %val0, i32 addrspace(1)* %out, align 4
2803  ret void
2804}
2805
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with success
; ordering acq_rel and failure ordering monotonic. No syncscope is written on
; the instruction (the test name calls this the "system" case). Field 0 of the
; cmpxchg result is stored back to %out, so the returning form of the atomic is
; exercised (every expected atomic below carries "glc").
;
; The per-target assertion lines inside the definition are autogenerated by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
;
; What the assertions pin down for this ordering pair:
;  - ahead of the atomic, every run expects "s_waitcnt vmcnt(0) lgkmcnt(0)";
;    the two GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" and the
;    two GFX90A runs additionally expect "buffer_wbl2";
;  - after the atomic, every run expects "s_waitcnt vmcnt(0)";
;  - before the final store, every run except SKIP-CACHE-INV expects its
;    invalidate instruction(s): buffer_wbinvl1, buffer_wbinvl1_vol,
;    buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 + buffer_wbinvl1_vol.
2806define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
2807; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2808; GFX6:       ; %bb.0: ; %entry
2809; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2810; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2811; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2812; GFX6-NEXT:    s_mov_b32 s2, -1
2813; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2814; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2815; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2816; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2817; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2818; GFX6-NEXT:    s_waitcnt vmcnt(0)
2819; GFX6-NEXT:    buffer_wbinvl1
2820; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2821; GFX6-NEXT:    s_endpgm
2822;
2823; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2824; GFX7:       ; %bb.0: ; %entry
2825; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2826; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2827; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2828; GFX7-NEXT:    s_add_u32 s4, s0, 16
2829; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2830; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2831; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2832; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2833; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2834; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2835; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2836; GFX7-NEXT:    s_waitcnt vmcnt(0)
2837; GFX7-NEXT:    buffer_wbinvl1_vol
2838; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2839; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2840; GFX7-NEXT:    flat_store_dword v[0:1], v2
2841; GFX7-NEXT:    s_endpgm
2842;
2843; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2844; GFX10-WGP:       ; %bb.0: ; %entry
2845; GFX10-WGP-NEXT:    s_clause 0x1
2846; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2847; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2848; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2849; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2850; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2851; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2852; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2853; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2854; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2855; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2856; GFX10-WGP-NEXT:    buffer_gl0_inv
2857; GFX10-WGP-NEXT:    buffer_gl1_inv
2858; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2859; GFX10-WGP-NEXT:    s_endpgm
2860;
2861; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2862; GFX10-CU:       ; %bb.0: ; %entry
2863; GFX10-CU-NEXT:    s_clause 0x1
2864; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2865; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2866; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2867; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2868; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2869; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2870; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2871; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2872; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2873; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2874; GFX10-CU-NEXT:    buffer_gl0_inv
2875; GFX10-CU-NEXT:    buffer_gl1_inv
2876; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2877; GFX10-CU-NEXT:    s_endpgm
2878;
2879; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2880; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2881; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2882; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2883; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2884; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2885; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2886; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2887; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2888; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2889; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2890; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2891; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2892; SKIP-CACHE-INV-NEXT:    s_endpgm
2893;
2894; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2895; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2896; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2897; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2898; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2899; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2900; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2901; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2902; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2903; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2904; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2905; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2906; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2907; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2908; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2909;
2910; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2911; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2912; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2913; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2914; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
2915; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2916; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2917; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2918; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2919; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
2920; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2921; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2922; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2923; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
2924; GFX90A-TGSPLIT-NEXT:    s_endpgm
2925    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2926entry:
  ; Address of i32 element 4 of %out, i.e. the 16-byte offset that shows up in
  ; the expected instructions above (offset:16, or an add of 16 on GFX7).
2927  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the compare operand and %in is the replacement operand.
2928  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic
  ; Take the i32 member of the { i32, i1 } result (per the LLVM LangRef this is
  ; the value read from memory) and store it to the start of %out.
2929  %val0 = extractvalue { i32, i1 } %val, 0
2930  store i32 %val0, i32 addrspace(1)* %out, align 4
2931  ret void
2932}
2933
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with success
; ordering seq_cst and failure ordering monotonic. No syncscope is written on
; the instruction (the test name calls this the "system" case). Field 0 of the
; cmpxchg result is stored back to %out, so the returning form of the atomic is
; exercised (every expected atomic below carries "glc").
;
; The per-target assertion lines inside the definition are autogenerated by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
;
; What the assertions pin down for this ordering pair:
;  - ahead of the atomic, every run expects "s_waitcnt vmcnt(0) lgkmcnt(0)";
;    the two GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" and the
;    two GFX90A runs additionally expect "buffer_wbl2";
;  - after the atomic, every run expects "s_waitcnt vmcnt(0)";
;  - before the final store, every run except SKIP-CACHE-INV expects its
;    invalidate instruction(s): buffer_wbinvl1, buffer_wbinvl1_vol,
;    buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 + buffer_wbinvl1_vol.
2934define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
2935; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2936; GFX6:       ; %bb.0: ; %entry
2937; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2938; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
2939; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
2940; GFX6-NEXT:    s_mov_b32 s2, -1
2941; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2943; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2944; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2945; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
2946; GFX6-NEXT:    s_waitcnt vmcnt(0)
2947; GFX6-NEXT:    buffer_wbinvl1
2948; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2949; GFX6-NEXT:    s_endpgm
2950;
2951; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2952; GFX7:       ; %bb.0: ; %entry
2953; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2954; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2955; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2956; GFX7-NEXT:    s_add_u32 s4, s0, 16
2957; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2958; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2959; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2960; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2961; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2962; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2963; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2964; GFX7-NEXT:    s_waitcnt vmcnt(0)
2965; GFX7-NEXT:    buffer_wbinvl1_vol
2966; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2967; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2968; GFX7-NEXT:    flat_store_dword v[0:1], v2
2969; GFX7-NEXT:    s_endpgm
2970;
2971; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2972; GFX10-WGP:       ; %bb.0: ; %entry
2973; GFX10-WGP-NEXT:    s_clause 0x1
2974; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2975; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2976; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2977; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2978; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2979; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2980; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2981; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2982; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2983; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2984; GFX10-WGP-NEXT:    buffer_gl0_inv
2985; GFX10-WGP-NEXT:    buffer_gl1_inv
2986; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2987; GFX10-WGP-NEXT:    s_endpgm
2988;
2989; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2990; GFX10-CU:       ; %bb.0: ; %entry
2991; GFX10-CU-NEXT:    s_clause 0x1
2992; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2993; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2994; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2995; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2996; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2997; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2998; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2999; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3000; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3001; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3002; GFX10-CU-NEXT:    buffer_gl0_inv
3003; GFX10-CU-NEXT:    buffer_gl1_inv
3004; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3005; GFX10-CU-NEXT:    s_endpgm
3006;
3007; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
3008; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3009; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3010; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3011; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3012; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3013; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3014; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3015; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3016; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3017; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3018; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3019; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3020; SKIP-CACHE-INV-NEXT:    s_endpgm
3021;
3022; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
3023; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3024; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3025; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3026; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3028; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3029; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3030; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3031; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3032; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3033; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3034; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3035; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3036; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3037;
3038; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
3039; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3040; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3041; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3042; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3043; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3044; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3045; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3046; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3047; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3048; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3049; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3050; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3051; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3052; GFX90A-TGSPLIT-NEXT:    s_endpgm
3053    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3054entry:
  ; Address of i32 element 4 of %out, i.e. the 16-byte offset that shows up in
  ; the expected instructions above (offset:16, or an add of 16 on GFX7).
3055  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the compare operand and %in is the replacement operand.
3056  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic
  ; Take the i32 member of the { i32, i1 } result (per the LLVM LangRef this is
  ; the value read from memory) and store it to the start of %out.
3057  %val0 = extractvalue { i32, i1 } %val, 0
3058  store i32 %val0, i32 addrspace(1)* %out, align 4
3059  ret void
3060}
3061
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with success
; ordering acquire and failure ordering acquire. No syncscope is written on
; the instruction (the test name calls this the "system" case). Field 0 of the
; cmpxchg result is stored back to %out, so the returning form of the atomic is
; exercised (every expected atomic below carries "glc").
;
; The per-target assertion lines inside the definition are autogenerated by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
;
; What the assertions pin down for this ordering pair:
;  - nothing is expected between the operand setup and the atomic: there is no
;    "s_waitcnt vmcnt(0) lgkmcnt(0)", no "s_waitcnt_vscnt" on the GFX10 runs
;    and no "buffer_wbl2" on the GFX90A runs;
;  - after the atomic, every run expects "s_waitcnt vmcnt(0)";
;  - before the final store, every run except SKIP-CACHE-INV expects its
;    invalidate instruction(s): buffer_wbinvl1, buffer_wbinvl1_vol,
;    buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 + buffer_wbinvl1_vol.
3062define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
3063; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3064; GFX6:       ; %bb.0: ; %entry
3065; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3066; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3067; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3068; GFX6-NEXT:    s_mov_b32 s2, -1
3069; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3070; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3071; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3072; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3073; GFX6-NEXT:    s_waitcnt vmcnt(0)
3074; GFX6-NEXT:    buffer_wbinvl1
3075; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3076; GFX6-NEXT:    s_endpgm
3077;
3078; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3079; GFX7:       ; %bb.0: ; %entry
3080; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3081; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3082; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3083; GFX7-NEXT:    s_add_u32 s4, s0, 16
3084; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3085; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3086; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3087; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3088; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3089; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3090; GFX7-NEXT:    s_waitcnt vmcnt(0)
3091; GFX7-NEXT:    buffer_wbinvl1_vol
3092; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3093; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3094; GFX7-NEXT:    flat_store_dword v[0:1], v2
3095; GFX7-NEXT:    s_endpgm
3096;
3097; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3098; GFX10-WGP:       ; %bb.0: ; %entry
3099; GFX10-WGP-NEXT:    s_clause 0x1
3100; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3101; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3102; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3103; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3104; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3105; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3106; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3107; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3108; GFX10-WGP-NEXT:    buffer_gl0_inv
3109; GFX10-WGP-NEXT:    buffer_gl1_inv
3110; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3111; GFX10-WGP-NEXT:    s_endpgm
3112;
3113; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3114; GFX10-CU:       ; %bb.0: ; %entry
3115; GFX10-CU-NEXT:    s_clause 0x1
3116; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3117; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3118; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3119; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3120; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3121; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3122; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3123; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3124; GFX10-CU-NEXT:    buffer_gl0_inv
3125; GFX10-CU-NEXT:    buffer_gl1_inv
3126; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3127; GFX10-CU-NEXT:    s_endpgm
3128;
3129; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3130; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3131; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3132; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3133; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3134; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3135; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3136; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3137; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3138; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3139; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3140; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3141; SKIP-CACHE-INV-NEXT:    s_endpgm
3142;
3143; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3144; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3145; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3146; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3147; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3148; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3149; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3150; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3151; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3152; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3153; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3154; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3155; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3156;
3157; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
3158; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3159; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3160; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3161; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3162; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3163; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3164; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3165; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3166; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3167; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3168; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3169; GFX90A-TGSPLIT-NEXT:    s_endpgm
3170    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3171entry:
  ; Address of i32 element 4 of %out, i.e. the 16-byte offset that shows up in
  ; the expected instructions above (offset:16, or an add of 16 on GFX7).
3172  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the compare operand and %in is the replacement operand.
3173  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire
  ; Take the i32 member of the { i32, i1 } result (per the LLVM LangRef this is
  ; the value read from memory) and store it to the start of %out.
3174  %val0 = extractvalue { i32, i1 } %val, 0
3175  store i32 %val0, i32 addrspace(1)* %out, align 4
3176  ret void
3177}
3178
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with success
; ordering release and failure ordering acquire. No syncscope is written on
; the instruction (the test name calls this the "system" case). Field 0 of the
; cmpxchg result is stored back to %out, so the returning form of the atomic is
; exercised (every expected atomic below carries "glc").
;
; The per-target assertion lines inside the definition are autogenerated by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
;
; What the assertions pin down for this ordering pair:
;  - ahead of the atomic, every run expects "s_waitcnt vmcnt(0) lgkmcnt(0)";
;    the two GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" and the
;    two GFX90A runs additionally expect "buffer_wbl2";
;  - after the atomic, every run expects "s_waitcnt vmcnt(0)";
;  - before the final store, every run except SKIP-CACHE-INV expects its
;    invalidate instruction(s): buffer_wbinvl1, buffer_wbinvl1_vol,
;    buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 + buffer_wbinvl1_vol.
3179define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
3180; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg:
3181; GFX6:       ; %bb.0: ; %entry
3182; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3183; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3184; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3185; GFX6-NEXT:    s_mov_b32 s2, -1
3186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3187; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3188; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3189; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3190; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3191; GFX6-NEXT:    s_waitcnt vmcnt(0)
3192; GFX6-NEXT:    buffer_wbinvl1
3193; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3194; GFX6-NEXT:    s_endpgm
3195;
3196; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
3197; GFX7:       ; %bb.0: ; %entry
3198; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3199; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3200; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3201; GFX7-NEXT:    s_add_u32 s4, s0, 16
3202; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3203; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3204; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3205; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3206; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3207; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3208; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3209; GFX7-NEXT:    s_waitcnt vmcnt(0)
3210; GFX7-NEXT:    buffer_wbinvl1_vol
3211; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3212; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3213; GFX7-NEXT:    flat_store_dword v[0:1], v2
3214; GFX7-NEXT:    s_endpgm
3215;
3216; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
3217; GFX10-WGP:       ; %bb.0: ; %entry
3218; GFX10-WGP-NEXT:    s_clause 0x1
3219; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3220; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3221; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3222; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3223; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3224; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3225; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3226; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3227; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3228; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3229; GFX10-WGP-NEXT:    buffer_gl0_inv
3230; GFX10-WGP-NEXT:    buffer_gl1_inv
3231; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3232; GFX10-WGP-NEXT:    s_endpgm
3233;
3234; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
3235; GFX10-CU:       ; %bb.0: ; %entry
3236; GFX10-CU-NEXT:    s_clause 0x1
3237; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3238; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3239; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3240; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3241; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3242; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3243; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3244; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3245; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3246; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3247; GFX10-CU-NEXT:    buffer_gl0_inv
3248; GFX10-CU-NEXT:    buffer_gl1_inv
3249; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3250; GFX10-CU-NEXT:    s_endpgm
3251;
3252; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg:
3253; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3254; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3255; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3256; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3257; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3258; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3259; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3260; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3261; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3262; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3263; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3264; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3265; SKIP-CACHE-INV-NEXT:    s_endpgm
3266;
3267; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
3268; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3269; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3270; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3271; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3272; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3273; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3274; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3275; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3276; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3277; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3278; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3279; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3280; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3281; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3282;
3283; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
3284; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3285; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3286; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3287; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3288; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3289; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3290; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3291; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3292; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3293; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3294; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3295; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3296; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3297; GFX90A-TGSPLIT-NEXT:    s_endpgm
3298    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3299entry:
  ; Address of i32 element 4 of %out, i.e. the 16-byte offset that shows up in
  ; the expected instructions above (offset:16, or an add of 16 on GFX7).
3300  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the compare operand and %in is the replacement operand.
3301  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire
  ; Take the i32 member of the { i32, i1 } result (per the LLVM LangRef this is
  ; the value read from memory) and store it to the start of %out.
3302  %val0 = extractvalue { i32, i1 } %val, 0
3303  store i32 %val0, i32 addrspace(1)* %out, align 4
3304  ret void
3305}
3306
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with success
; ordering acq_rel and failure ordering acquire. No syncscope is written on
; the instruction (the test name calls this the "system" case). Field 0 of the
; cmpxchg result is stored back to %out, so the returning form of the atomic is
; exercised (every expected atomic below carries "glc").
;
; The per-target assertion lines inside the definition are autogenerated by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
;
; What the assertions pin down for this ordering pair:
;  - ahead of the atomic, every run expects "s_waitcnt vmcnt(0) lgkmcnt(0)";
;    the two GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" and the
;    two GFX90A runs additionally expect "buffer_wbl2";
;  - after the atomic, every run expects "s_waitcnt vmcnt(0)";
;  - before the final store, every run except SKIP-CACHE-INV expects its
;    invalidate instruction(s): buffer_wbinvl1, buffer_wbinvl1_vol,
;    buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 + buffer_wbinvl1_vol.
3307define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
3308; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3309; GFX6:       ; %bb.0: ; %entry
3310; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3311; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3312; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3313; GFX6-NEXT:    s_mov_b32 s2, -1
3314; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3315; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3316; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3317; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3318; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3319; GFX6-NEXT:    s_waitcnt vmcnt(0)
3320; GFX6-NEXT:    buffer_wbinvl1
3321; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3322; GFX6-NEXT:    s_endpgm
3323;
3324; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3325; GFX7:       ; %bb.0: ; %entry
3326; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3327; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3328; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3329; GFX7-NEXT:    s_add_u32 s4, s0, 16
3330; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3331; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3332; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3333; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3334; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3335; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3336; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3337; GFX7-NEXT:    s_waitcnt vmcnt(0)
3338; GFX7-NEXT:    buffer_wbinvl1_vol
3339; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3340; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3341; GFX7-NEXT:    flat_store_dword v[0:1], v2
3342; GFX7-NEXT:    s_endpgm
3343;
3344; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3345; GFX10-WGP:       ; %bb.0: ; %entry
3346; GFX10-WGP-NEXT:    s_clause 0x1
3347; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3348; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3349; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3350; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3351; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3352; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3353; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3354; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3355; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3356; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3357; GFX10-WGP-NEXT:    buffer_gl0_inv
3358; GFX10-WGP-NEXT:    buffer_gl1_inv
3359; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3360; GFX10-WGP-NEXT:    s_endpgm
3361;
3362; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3363; GFX10-CU:       ; %bb.0: ; %entry
3364; GFX10-CU-NEXT:    s_clause 0x1
3365; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3366; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3367; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3368; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3369; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3370; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3371; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3372; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3373; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3374; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3375; GFX10-CU-NEXT:    buffer_gl0_inv
3376; GFX10-CU-NEXT:    buffer_gl1_inv
3377; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3378; GFX10-CU-NEXT:    s_endpgm
3379;
3380; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3381; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3382; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3383; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3384; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3385; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3386; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3387; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3388; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3389; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3390; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3391; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3392; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3393; SKIP-CACHE-INV-NEXT:    s_endpgm
3394;
3395; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3396; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3397; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3398; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3399; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3400; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3401; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3402; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3403; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3404; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3405; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3406; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3407; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3408; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3409; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3410;
3411; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
3412; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3413; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3414; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3415; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3416; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3417; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3418; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3419; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3420; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3421; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3422; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3423; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3424; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3425; GFX90A-TGSPLIT-NEXT:    s_endpgm
3426    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3427entry:
  ; Address of i32 element 4 of %out, i.e. the 16-byte offset that shows up in
  ; the expected instructions above (offset:16, or an add of 16 on GFX7).
3428  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the compare operand and %in is the replacement operand.
3429  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire
  ; Take the i32 member of the { i32, i1 } result (per the LLVM LangRef this is
  ; the value read from memory) and store it to the start of %out.
3430  %val0 = extractvalue { i32, i1 } %val, 0
3431  store i32 %val0, i32 addrspace(1)* %out, align 4
3432  ret void
3433}
3434
; Checks the code generated for a volatile cmpxchg on an i32 in addrspace(1)
; with success ordering seq_cst, failure ordering acquire and no explicit
; syncscope, where the value loaded by the cmpxchg is used: it is extracted
; from the result pair and stored to %out.
; In the expectations below the atomic carries glc, is preceded by an
; s_waitcnt on vmcnt and lgkmcnt (plus s_waitcnt_vscnt on the gfx1010 runs and
; buffer_wbl2 on the gfx90a runs), and is followed by s_waitcnt vmcnt(0) and
; cache invalidation instructions. The run that passes
; -amdgcn-skip-cache-invalidations expects the waits but no invalidation.
3435define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
3436; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3437; GFX6:       ; %bb.0: ; %entry
3438; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3439; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3440; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3441; GFX6-NEXT:    s_mov_b32 s2, -1
3442; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3443; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3444; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3445; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3446; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3447; GFX6-NEXT:    s_waitcnt vmcnt(0)
3448; GFX6-NEXT:    buffer_wbinvl1
3449; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3450; GFX6-NEXT:    s_endpgm
3451;
3452; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3453; GFX7:       ; %bb.0: ; %entry
3454; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3455; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3456; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3457; GFX7-NEXT:    s_add_u32 s4, s0, 16
3458; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3459; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3460; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3461; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3462; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3463; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3464; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3465; GFX7-NEXT:    s_waitcnt vmcnt(0)
3466; GFX7-NEXT:    buffer_wbinvl1_vol
3467; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3468; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3469; GFX7-NEXT:    flat_store_dword v[0:1], v2
3470; GFX7-NEXT:    s_endpgm
3471;
3472; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3473; GFX10-WGP:       ; %bb.0: ; %entry
3474; GFX10-WGP-NEXT:    s_clause 0x1
3475; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3476; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3477; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3478; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3479; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3480; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3481; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3482; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3483; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3484; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3485; GFX10-WGP-NEXT:    buffer_gl0_inv
3486; GFX10-WGP-NEXT:    buffer_gl1_inv
3487; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3488; GFX10-WGP-NEXT:    s_endpgm
3489;
3490; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3491; GFX10-CU:       ; %bb.0: ; %entry
3492; GFX10-CU-NEXT:    s_clause 0x1
3493; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3494; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3495; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3496; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3497; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3498; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3499; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3500; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3501; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3502; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3503; GFX10-CU-NEXT:    buffer_gl0_inv
3504; GFX10-CU-NEXT:    buffer_gl1_inv
3505; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3506; GFX10-CU-NEXT:    s_endpgm
3507;
3508; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3509; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3510; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3511; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3512; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3513; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3514; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3516; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3517; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3518; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3519; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3520; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3521; SKIP-CACHE-INV-NEXT:    s_endpgm
3522;
3523; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3524; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3525; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3526; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3527; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3528; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3529; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3530; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3531; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3532; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3533; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3534; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3535; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3536; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3537; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3538;
3539; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
3540; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3541; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3542; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3543; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3544; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3545; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3546; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3547; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3548; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3549; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3550; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3551; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3552; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3553; GFX90A-TGSPLIT-NEXT:    s_endpgm
3554    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3555entry:
  ; The cmpxchg targets the i32 four elements (16 bytes) past %out; the
  ; expectations above show this as offset:16 or as an s_add_u32 of 16.
3556  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparison value and %in is the replacement value.
3557  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire
  ; Field 0 of the result pair is the value that was loaded from memory;
  ; storing it keeps the returning form of the atomic alive.
3558  %val0 = extractvalue { i32, i1 } %val, 0
3559  store i32 %val0, i32 addrspace(1)* %out, align 4
3560  ret void
3561}
3562
; Checks the code generated for a volatile cmpxchg on an i32 in addrspace(1)
; with seq_cst as both the success and the failure ordering and no explicit
; syncscope, where the value loaded by the cmpxchg is used: it is extracted
; from the result pair and stored to %out.
; In the expectations below the atomic carries glc, is preceded by an
; s_waitcnt on vmcnt and lgkmcnt (plus s_waitcnt_vscnt on the gfx1010 runs and
; buffer_wbl2 on the gfx90a runs), and is followed by s_waitcnt vmcnt(0) and
; cache invalidation instructions. The run that passes
; -amdgcn-skip-cache-invalidations expects the waits but no invalidation.
3563define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
3564; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3565; GFX6:       ; %bb.0: ; %entry
3566; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3567; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
3568; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3569; GFX6-NEXT:    s_mov_b32 s2, -1
3570; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3571; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3572; GFX6-NEXT:    v_mov_b32_e32 v1, s5
3573; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3574; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
3575; GFX6-NEXT:    s_waitcnt vmcnt(0)
3576; GFX6-NEXT:    buffer_wbinvl1
3577; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3578; GFX6-NEXT:    s_endpgm
3579;
3580; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3581; GFX7:       ; %bb.0: ; %entry
3582; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3583; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3584; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3585; GFX7-NEXT:    s_add_u32 s4, s0, 16
3586; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3587; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3588; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3589; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3590; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3591; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3592; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3593; GFX7-NEXT:    s_waitcnt vmcnt(0)
3594; GFX7-NEXT:    buffer_wbinvl1_vol
3595; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3596; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3597; GFX7-NEXT:    flat_store_dword v[0:1], v2
3598; GFX7-NEXT:    s_endpgm
3599;
3600; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3601; GFX10-WGP:       ; %bb.0: ; %entry
3602; GFX10-WGP-NEXT:    s_clause 0x1
3603; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3604; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3605; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3606; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3607; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3608; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3609; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3610; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3611; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3612; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3613; GFX10-WGP-NEXT:    buffer_gl0_inv
3614; GFX10-WGP-NEXT:    buffer_gl1_inv
3615; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
3616; GFX10-WGP-NEXT:    s_endpgm
3617;
3618; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3619; GFX10-CU:       ; %bb.0: ; %entry
3620; GFX10-CU-NEXT:    s_clause 0x1
3621; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3622; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3623; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3624; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3625; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3626; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3627; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3628; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3629; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
3630; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3631; GFX10-CU-NEXT:    buffer_gl0_inv
3632; GFX10-CU-NEXT:    buffer_gl1_inv
3633; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
3634; GFX10-CU-NEXT:    s_endpgm
3635;
3636; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3637; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3638; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3639; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3640; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3641; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3642; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3645; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3646; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
3647; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3648; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3649; SKIP-CACHE-INV-NEXT:    s_endpgm
3650;
3651; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3652; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3653; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3655; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3656; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3657; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3658; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3659; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3660; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3661; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3662; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3663; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3664; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3665; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3666;
3667; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
3668; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3669; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3670; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3671; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
3672; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3673; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
3674; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3675; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3676; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
3677; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3678; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3679; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3680; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
3681; GFX90A-TGSPLIT-NEXT:    s_endpgm
3682    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3683entry:
  ; The cmpxchg targets the i32 four elements (16 bytes) past %out; the
  ; expectations above show this as offset:16 or as an s_add_u32 of 16.
3684  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %old is the comparison value and %in is the replacement value.
3685  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
  ; Field 0 of the result pair is the value that was loaded from memory;
  ; storing it keeps the returning form of the atomic alive.
3686  %val0 = extractvalue { i32, i1 } %val, 0
3687  store i32 %val0, i32 addrspace(1)* %out, align 4
3688  ret void
3689}
3690
; Checks the code generated for an atomic i32 load from addrspace(1) with
; syncscope("one-as") and unordered ordering; the loaded value is written to
; %out with a plain store.
; Every run expects a load without the glc bit and no cache invalidation
; instruction; the only wait after the load is the s_waitcnt vmcnt(0) that
; precedes the dependent store.
3691define amdgpu_kernel void @global_system_one_as_unordered_load(
3692; GFX6-LABEL: global_system_one_as_unordered_load:
3693; GFX6:       ; %bb.0: ; %entry
3694; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3695; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3696; GFX6-NEXT:    s_mov_b32 s2, -1
3697; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3698; GFX6-NEXT:    s_mov_b32 s0, s4
3699; GFX6-NEXT:    s_mov_b32 s1, s5
3700; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
3701; GFX6-NEXT:    s_mov_b32 s4, s6
3702; GFX6-NEXT:    s_mov_b32 s5, s7
3703; GFX6-NEXT:    s_mov_b32 s6, s2
3704; GFX6-NEXT:    s_mov_b32 s7, s3
3705; GFX6-NEXT:    s_waitcnt vmcnt(0)
3706; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3707; GFX6-NEXT:    s_endpgm
3708;
3709; GFX7-LABEL: global_system_one_as_unordered_load:
3710; GFX7:       ; %bb.0: ; %entry
3711; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3712; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3713; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3714; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3715; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3716; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3717; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3718; GFX7-NEXT:    s_waitcnt vmcnt(0)
3719; GFX7-NEXT:    flat_store_dword v[2:3], v0
3720; GFX7-NEXT:    s_endpgm
3721;
3722; GFX10-WGP-LABEL: global_system_one_as_unordered_load:
3723; GFX10-WGP:       ; %bb.0: ; %entry
3724; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3725; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3726; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3727; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
3728; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3729; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3730; GFX10-WGP-NEXT:    s_endpgm
3731;
3732; GFX10-CU-LABEL: global_system_one_as_unordered_load:
3733; GFX10-CU:       ; %bb.0: ; %entry
3734; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3735; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3736; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3737; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
3738; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3739; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3740; GFX10-CU-NEXT:    s_endpgm
3741;
3742; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load:
3743; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3744; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3745; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3746; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3747; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3748; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3750; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
3751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3752; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3753; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3754; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3755; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3756; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3757; SKIP-CACHE-INV-NEXT:    s_endpgm
3758;
3759; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
3760; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3761; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3762; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3763; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3764; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
3765; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3766; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3767; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3768;
3769; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load:
3770; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3771; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3772; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3773; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3774; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
3775; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3776; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3777; GFX90A-TGSPLIT-NEXT:    s_endpgm
3778    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3779entry:
  ; The atomic load is the operation under test; the non-atomic store below
  ; only consumes its result.
3780  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4
3781  store i32 %val, i32 addrspace(1)* %out
3782  ret void
3783}
3784
; Checks the code generated for an atomic i32 load from addrspace(1) with
; syncscope("one-as") and monotonic ordering; the loaded value is written to
; %out with a plain store.
; Every run expects the load to carry glc (glc dlc on the gfx1010 runs) and
; no cache invalidation instruction; the only wait after the load is the
; s_waitcnt vmcnt(0) that precedes the dependent store.
3785define amdgpu_kernel void @global_system_one_as_monotonic_load(
3786; GFX6-LABEL: global_system_one_as_monotonic_load:
3787; GFX6:       ; %bb.0: ; %entry
3788; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3789; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3790; GFX6-NEXT:    s_mov_b32 s2, -1
3791; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3792; GFX6-NEXT:    s_mov_b32 s0, s4
3793; GFX6-NEXT:    s_mov_b32 s1, s5
3794; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3795; GFX6-NEXT:    s_mov_b32 s4, s6
3796; GFX6-NEXT:    s_mov_b32 s5, s7
3797; GFX6-NEXT:    s_mov_b32 s6, s2
3798; GFX6-NEXT:    s_mov_b32 s7, s3
3799; GFX6-NEXT:    s_waitcnt vmcnt(0)
3800; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3801; GFX6-NEXT:    s_endpgm
3802;
3803; GFX7-LABEL: global_system_one_as_monotonic_load:
3804; GFX7:       ; %bb.0: ; %entry
3805; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3806; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3807; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3808; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3809; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3810; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3811; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3812; GFX7-NEXT:    s_waitcnt vmcnt(0)
3813; GFX7-NEXT:    flat_store_dword v[2:3], v0
3814; GFX7-NEXT:    s_endpgm
3815;
3816; GFX10-WGP-LABEL: global_system_one_as_monotonic_load:
3817; GFX10-WGP:       ; %bb.0: ; %entry
3818; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3819; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3820; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3821; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3822; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3823; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3824; GFX10-WGP-NEXT:    s_endpgm
3825;
3826; GFX10-CU-LABEL: global_system_one_as_monotonic_load:
3827; GFX10-CU:       ; %bb.0: ; %entry
3828; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3829; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3830; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3831; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3832; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3833; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3834; GFX10-CU-NEXT:    s_endpgm
3835;
3836; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load:
3837; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3838; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3839; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3840; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3841; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3842; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3843; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3844; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3845; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3846; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3847; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3848; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3849; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3850; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3851; SKIP-CACHE-INV-NEXT:    s_endpgm
3852;
3853; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
3854; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3855; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3856; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3857; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3858; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3859; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3860; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3861; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3862;
3863; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
3864; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3865; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3866; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3867; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3868; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3869; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3870; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3871; GFX90A-TGSPLIT-NEXT:    s_endpgm
3872    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3873entry:
  ; The atomic load is the operation under test; the non-atomic store below
  ; only consumes its result.
3874  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4
3875  store i32 %val, i32 addrspace(1)* %out
3876  ret void
3877}
3878
; Checks the code generated for an atomic i32 load from addrspace(1) with
; syncscope("one-as") and acquire ordering; the loaded value is written to
; %out with a plain store.
; Every run expects the load to carry glc (glc dlc on the gfx1010 runs),
; followed by s_waitcnt vmcnt(0) and then cache invalidation: buffer_wbinvl1
; on gfx600, buffer_wbinvl1_vol on gfx700, buffer_gl0_inv plus buffer_gl1_inv
; on gfx1010, and buffer_invl2 plus buffer_wbinvl1_vol on gfx90a. The run
; that passes -amdgcn-skip-cache-invalidations expects the wait but no
; invalidation instruction.
3879define amdgpu_kernel void @global_system_one_as_acquire_load(
3880; GFX6-LABEL: global_system_one_as_acquire_load:
3881; GFX6:       ; %bb.0: ; %entry
3882; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3883; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3884; GFX6-NEXT:    s_mov_b32 s2, -1
3885; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3886; GFX6-NEXT:    s_mov_b32 s0, s4
3887; GFX6-NEXT:    s_mov_b32 s1, s5
3888; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3889; GFX6-NEXT:    s_waitcnt vmcnt(0)
3890; GFX6-NEXT:    buffer_wbinvl1
3891; GFX6-NEXT:    s_mov_b32 s4, s6
3892; GFX6-NEXT:    s_mov_b32 s5, s7
3893; GFX6-NEXT:    s_mov_b32 s6, s2
3894; GFX6-NEXT:    s_mov_b32 s7, s3
3895; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3896; GFX6-NEXT:    s_endpgm
3897;
3898; GFX7-LABEL: global_system_one_as_acquire_load:
3899; GFX7:       ; %bb.0: ; %entry
3900; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3901; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3902; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3903; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3904; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3905; GFX7-NEXT:    s_waitcnt vmcnt(0)
3906; GFX7-NEXT:    buffer_wbinvl1_vol
3907; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3908; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3909; GFX7-NEXT:    flat_store_dword v[2:3], v0
3910; GFX7-NEXT:    s_endpgm
3911;
3912; GFX10-WGP-LABEL: global_system_one_as_acquire_load:
3913; GFX10-WGP:       ; %bb.0: ; %entry
3914; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3915; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3916; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3917; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3918; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3919; GFX10-WGP-NEXT:    buffer_gl0_inv
3920; GFX10-WGP-NEXT:    buffer_gl1_inv
3921; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3922; GFX10-WGP-NEXT:    s_endpgm
3923;
3924; GFX10-CU-LABEL: global_system_one_as_acquire_load:
3925; GFX10-CU:       ; %bb.0: ; %entry
3926; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3927; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3928; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3929; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3930; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3931; GFX10-CU-NEXT:    buffer_gl0_inv
3932; GFX10-CU-NEXT:    buffer_gl1_inv
3933; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3934; GFX10-CU-NEXT:    s_endpgm
3935;
3936; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load:
3937; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3938; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3939; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3940; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3941; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3942; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3943; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3944; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3945; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3946; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3947; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3948; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3949; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3950; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3951; SKIP-CACHE-INV-NEXT:    s_endpgm
3952;
3953; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
3954; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3955; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3956; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3957; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3958; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3959; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3960; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3961; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3962; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3963; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3964;
3965; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load:
3966; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3967; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3968; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3969; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3970; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
3971; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3972; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3973; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3974; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
3975; GFX90A-TGSPLIT-NEXT:    s_endpgm
3976    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3977entry:
  ; The atomic load is the operation under test; the non-atomic store below
  ; only consumes its result.
3978  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4
3979  store i32 %val, i32 addrspace(1)* %out
3980  ret void
3981}
3982
; Checks the code generated for an atomic i32 load from addrspace(1) with
; syncscope("one-as") and seq_cst ordering; the loaded value is written to
; %out with a plain store.
; Every run expects a wait before the load: s_waitcnt vmcnt(0) on gfx600,
; gfx700 and the skip-cache-invalidations run, s_waitcnt vmcnt(0) lgkmcnt(0)
; on gfx90a, and s_waitcnt vmcnt(0) lgkmcnt(0) followed by
; s_waitcnt_vscnt null, 0x0 on gfx1010. The load carries glc (glc dlc on
; gfx1010) and is followed by s_waitcnt vmcnt(0) and cache invalidation
; instructions, except in the run that passes
; -amdgcn-skip-cache-invalidations, which expects no invalidation.
3983define amdgpu_kernel void @global_system_one_as_seq_cst_load(
3984; GFX6-LABEL: global_system_one_as_seq_cst_load:
3985; GFX6:       ; %bb.0: ; %entry
3986; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3987; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
3988; GFX6-NEXT:    s_mov_b32 s2, -1
3989; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3990; GFX6-NEXT:    s_mov_b32 s0, s4
3991; GFX6-NEXT:    s_mov_b32 s1, s5
3992; GFX6-NEXT:    s_waitcnt vmcnt(0)
3993; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3994; GFX6-NEXT:    s_waitcnt vmcnt(0)
3995; GFX6-NEXT:    buffer_wbinvl1
3996; GFX6-NEXT:    s_mov_b32 s4, s6
3997; GFX6-NEXT:    s_mov_b32 s5, s7
3998; GFX6-NEXT:    s_mov_b32 s6, s2
3999; GFX6-NEXT:    s_mov_b32 s7, s3
4000; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4001; GFX6-NEXT:    s_endpgm
4002;
4003; GFX7-LABEL: global_system_one_as_seq_cst_load:
4004; GFX7:       ; %bb.0: ; %entry
4005; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4006; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4007; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4008; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4009; GFX7-NEXT:    s_waitcnt vmcnt(0)
4010; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
4011; GFX7-NEXT:    s_waitcnt vmcnt(0)
4012; GFX7-NEXT:    buffer_wbinvl1_vol
4013; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4014; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4015; GFX7-NEXT:    flat_store_dword v[2:3], v0
4016; GFX7-NEXT:    s_endpgm
4017;
4018; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load:
4019; GFX10-WGP:       ; %bb.0: ; %entry
4020; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4021; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4022; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4023; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4024; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
4025; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4026; GFX10-WGP-NEXT:    buffer_gl0_inv
4027; GFX10-WGP-NEXT:    buffer_gl1_inv
4028; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
4029; GFX10-WGP-NEXT:    s_endpgm
4030;
4031; GFX10-CU-LABEL: global_system_one_as_seq_cst_load:
4032; GFX10-CU:       ; %bb.0: ; %entry
4033; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4034; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4035; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4036; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4037; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
4038; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4039; GFX10-CU-NEXT:    buffer_gl0_inv
4040; GFX10-CU-NEXT:    buffer_gl1_inv
4041; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
4042; GFX10-CU-NEXT:    s_endpgm
4043;
4044; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load:
4045; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4046; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
4047; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4048; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4049; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4050; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4051; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4052; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4053; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
4054; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4055; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
4056; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
4057; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
4058; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
4059; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4060; SKIP-CACHE-INV-NEXT:    s_endpgm
4061;
4062; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
4063; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4064; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4065; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4066; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4067; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
4068; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4069; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4070; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4071; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
4072; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4073;
4074; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
4075; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4076; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4077; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4078; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4079; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
4080; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4081; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4082; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4083; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
4084; GFX90A-TGSPLIT-NEXT:    s_endpgm
4085    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
4086entry:
  ; The atomic load is the operation under test; the non-atomic store below
  ; only consumes its result.
4087  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4
4088  store i32 %val, i32 addrspace(1)* %out
4089  ret void
4090}
4091
; Test: atomic `unordered` store of an i32 kernel argument to a global
; (addrspace(1)) pointer with syncscope("one-as").
; For every check prefix the expected output is a plain store preceded only by
; `s_waitcnt lgkmcnt(0)`; no vmcnt/vscnt wait and no cache write-back or
; invalidate instruction is expected.
4092define amdgpu_kernel void @global_system_one_as_unordered_store(
4093; GFX6-LABEL: global_system_one_as_unordered_store:
4094; GFX6:       ; %bb.0: ; %entry
4095; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4096; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4097; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4098; GFX6-NEXT:    s_mov_b32 s2, -1
4099; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4100; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4101; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4102; GFX6-NEXT:    s_endpgm
4103;
4104; GFX7-LABEL: global_system_one_as_unordered_store:
4105; GFX7:       ; %bb.0: ; %entry
4106; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4107; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4108; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4109; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4110; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4111; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4112; GFX7-NEXT:    flat_store_dword v[0:1], v2
4113; GFX7-NEXT:    s_endpgm
4114;
4115; GFX10-WGP-LABEL: global_system_one_as_unordered_store:
4116; GFX10-WGP:       ; %bb.0: ; %entry
4117; GFX10-WGP-NEXT:    s_clause 0x1
4118; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4119; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4120; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4121; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4122; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4123; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4124; GFX10-WGP-NEXT:    s_endpgm
4125;
4126; GFX10-CU-LABEL: global_system_one_as_unordered_store:
4127; GFX10-CU:       ; %bb.0: ; %entry
4128; GFX10-CU-NEXT:    s_clause 0x1
4129; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4130; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4131; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4132; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4133; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4134; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4135; GFX10-CU-NEXT:    s_endpgm
4136;
4137; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store:
4138; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4139; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4140; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4141; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4142; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4143; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4145; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4146; SKIP-CACHE-INV-NEXT:    s_endpgm
4147;
4148; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
4149; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4150; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4151; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4152; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4153; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4154; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4155; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4156; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4157;
4158; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store:
4159; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4160; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4161; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4162; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4163; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4164; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4165; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4166; GFX90A-TGSPLIT-NEXT:    s_endpgm
4167    i32 %in, i32 addrspace(1)* %out) {
4168entry:
; Operation under test: the only instruction in the kernel besides `ret`.
4169  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4
4170  ret void
4171}
4172
; Test: atomic `monotonic` store of an i32 kernel argument to a global
; (addrspace(1)) pointer with syncscope("one-as").
; For every check prefix the expected output is a plain store preceded only by
; `s_waitcnt lgkmcnt(0)`; no vmcnt/vscnt wait and no cache write-back or
; invalidate instruction is expected.
4173define amdgpu_kernel void @global_system_one_as_monotonic_store(
4174; GFX6-LABEL: global_system_one_as_monotonic_store:
4175; GFX6:       ; %bb.0: ; %entry
4176; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4177; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4178; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4179; GFX6-NEXT:    s_mov_b32 s2, -1
4180; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4181; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4182; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4183; GFX6-NEXT:    s_endpgm
4184;
4185; GFX7-LABEL: global_system_one_as_monotonic_store:
4186; GFX7:       ; %bb.0: ; %entry
4187; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4188; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4189; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4190; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4191; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4192; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4193; GFX7-NEXT:    flat_store_dword v[0:1], v2
4194; GFX7-NEXT:    s_endpgm
4195;
4196; GFX10-WGP-LABEL: global_system_one_as_monotonic_store:
4197; GFX10-WGP:       ; %bb.0: ; %entry
4198; GFX10-WGP-NEXT:    s_clause 0x1
4199; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4200; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4201; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4202; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4203; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4204; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4205; GFX10-WGP-NEXT:    s_endpgm
4206;
4207; GFX10-CU-LABEL: global_system_one_as_monotonic_store:
4208; GFX10-CU:       ; %bb.0: ; %entry
4209; GFX10-CU-NEXT:    s_clause 0x1
4210; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4211; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4212; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4213; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4214; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4215; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4216; GFX10-CU-NEXT:    s_endpgm
4217;
4218; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store:
4219; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4220; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4221; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4222; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4223; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4224; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4225; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4226; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4227; SKIP-CACHE-INV-NEXT:    s_endpgm
4228;
4229; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
4230; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4231; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4232; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4234; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4235; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4236; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4237; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4238;
4239; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
4240; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4241; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4242; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4243; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4244; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4245; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4246; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4247; GFX90A-TGSPLIT-NEXT:    s_endpgm
4248    i32 %in, i32 addrspace(1)* %out) {
4249entry:
; Operation under test: the only instruction in the kernel besides `ret`.
4250  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4
4251  ret void
4252}
4253
; Test: atomic `release` store of an i32 kernel argument to a global
; (addrspace(1)) pointer with syncscope("one-as").
; Expected sequence immediately before the store, per check prefix:
;   - GFX6, GFX7, SKIP-CACHE-INV  -> `s_waitcnt vmcnt(0)`
;   - GFX10 (WGP and CU)          -> `s_waitcnt vmcnt(0)` + `s_waitcnt_vscnt null, 0x0`
;   - GFX90A (both tgsplit modes) -> `buffer_wbl2` + `s_waitcnt vmcnt(0)`
; Nothing is expected between the store and `s_endpgm`.
4254define amdgpu_kernel void @global_system_one_as_release_store(
4255; GFX6-LABEL: global_system_one_as_release_store:
4256; GFX6:       ; %bb.0: ; %entry
4257; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4258; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4259; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4260; GFX6-NEXT:    s_mov_b32 s2, -1
4261; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4262; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4263; GFX6-NEXT:    s_waitcnt vmcnt(0)
4264; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4265; GFX6-NEXT:    s_endpgm
4266;
4267; GFX7-LABEL: global_system_one_as_release_store:
4268; GFX7:       ; %bb.0: ; %entry
4269; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4270; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4271; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4272; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4273; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4274; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4275; GFX7-NEXT:    s_waitcnt vmcnt(0)
4276; GFX7-NEXT:    flat_store_dword v[0:1], v2
4277; GFX7-NEXT:    s_endpgm
4278;
4279; GFX10-WGP-LABEL: global_system_one_as_release_store:
4280; GFX10-WGP:       ; %bb.0: ; %entry
4281; GFX10-WGP-NEXT:    s_clause 0x1
4282; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4283; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4284; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4285; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4286; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4287; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4288; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4289; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4290; GFX10-WGP-NEXT:    s_endpgm
4291;
4292; GFX10-CU-LABEL: global_system_one_as_release_store:
4293; GFX10-CU:       ; %bb.0: ; %entry
4294; GFX10-CU-NEXT:    s_clause 0x1
4295; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4296; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4297; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4298; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4299; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4300; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4301; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4302; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4303; GFX10-CU-NEXT:    s_endpgm
4304;
4305; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store:
4306; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4307; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4308; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4309; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4310; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4311; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4313; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4314; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4315; SKIP-CACHE-INV-NEXT:    s_endpgm
4316;
4317; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
4318; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4319; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4320; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4321; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4322; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4323; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4324; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4325; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4326; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4327; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4328;
4329; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store:
4330; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4331; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4332; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4333; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4334; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4335; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4336; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4337; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4338; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4339; GFX90A-TGSPLIT-NEXT:    s_endpgm
4340    i32 %in, i32 addrspace(1)* %out) {
4341entry:
; Operation under test: the only instruction in the kernel besides `ret`.
4342  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4
4343  ret void
4344}
4345
; Test: atomic `seq_cst` store of an i32 kernel argument to a global
; (addrspace(1)) pointer with syncscope("one-as").
; Expected sequence immediately before the store, per check prefix:
;   - GFX6, GFX7, SKIP-CACHE-INV  -> `s_waitcnt vmcnt(0)`
;   - GFX10 (WGP and CU)          -> `s_waitcnt vmcnt(0)` + `s_waitcnt_vscnt null, 0x0`
;   - GFX90A (both tgsplit modes) -> `buffer_wbl2` + `s_waitcnt vmcnt(0)`
; Nothing is expected between the store and `s_endpgm`.
4346define amdgpu_kernel void @global_system_one_as_seq_cst_store(
4347; GFX6-LABEL: global_system_one_as_seq_cst_store:
4348; GFX6:       ; %bb.0: ; %entry
4349; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
4350; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4351; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4352; GFX6-NEXT:    s_mov_b32 s2, -1
4353; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4354; GFX6-NEXT:    v_mov_b32_e32 v0, s6
4355; GFX6-NEXT:    s_waitcnt vmcnt(0)
4356; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4357; GFX6-NEXT:    s_endpgm
4358;
4359; GFX7-LABEL: global_system_one_as_seq_cst_store:
4360; GFX7:       ; %bb.0: ; %entry
4361; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4362; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4363; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4364; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4365; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4366; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4367; GFX7-NEXT:    s_waitcnt vmcnt(0)
4368; GFX7-NEXT:    flat_store_dword v[0:1], v2
4369; GFX7-NEXT:    s_endpgm
4370;
4371; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store:
4372; GFX10-WGP:       ; %bb.0: ; %entry
4373; GFX10-WGP-NEXT:    s_clause 0x1
4374; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4375; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4376; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4377; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4378; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4379; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4380; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4381; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4382; GFX10-WGP-NEXT:    s_endpgm
4383;
4384; GFX10-CU-LABEL: global_system_one_as_seq_cst_store:
4385; GFX10-CU:       ; %bb.0: ; %entry
4386; GFX10-CU-NEXT:    s_clause 0x1
4387; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4388; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4389; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4390; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4391; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4392; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4393; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4394; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4395; GFX10-CU-NEXT:    s_endpgm
4396;
4397; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store:
4398; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4399; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
4400; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4401; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
4402; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
4403; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4404; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4405; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4406; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4407; SKIP-CACHE-INV-NEXT:    s_endpgm
4408;
4409; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
4410; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4411; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4412; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4413; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4414; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4415; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4416; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4417; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4418; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4419; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4420;
4421; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
4422; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4423; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4424; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4425; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4426; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4427; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4428; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4429; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4430; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
4431; GFX90A-TGSPLIT-NEXT:    s_endpgm
4432    i32 %in, i32 addrspace(1)* %out) {
4433entry:
; Operation under test: the only instruction in the kernel besides `ret`.
4434  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4
4435  ret void
4436}
4437
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("one-as") and `monotonic` ordering; the result %val is unused.
; For every check prefix the expected output is the atomic swap instruction
; preceded only by `s_waitcnt lgkmcnt(0)`; no vmcnt/vscnt wait and no cache
; write-back or invalidate instruction is expected.
4438define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
4439; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw:
4440; GFX6:       ; %bb.0: ; %entry
4441; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4442; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4443; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4444; GFX6-NEXT:    s_mov_b32 s2, -1
4445; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4446; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4447; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4448; GFX6-NEXT:    s_endpgm
4449;
4450; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
4451; GFX7:       ; %bb.0: ; %entry
4452; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4453; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4454; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4455; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4456; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4457; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4458; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4459; GFX7-NEXT:    s_endpgm
4460;
4461; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
4462; GFX10-WGP:       ; %bb.0: ; %entry
4463; GFX10-WGP-NEXT:    s_clause 0x1
4464; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4465; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4466; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4467; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4468; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4469; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4470; GFX10-WGP-NEXT:    s_endpgm
4471;
4472; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
4473; GFX10-CU:       ; %bb.0: ; %entry
4474; GFX10-CU-NEXT:    s_clause 0x1
4475; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4476; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4477; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4478; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4479; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4480; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4481; GFX10-CU-NEXT:    s_endpgm
4482;
4483; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw:
4484; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4485; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4486; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4487; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4488; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4489; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4491; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4492; SKIP-CACHE-INV-NEXT:    s_endpgm
4493;
4494; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
4495; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4496; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4498; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4500; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4501; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4503;
4504; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
4505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4507; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4508; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4509; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4511; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4512; GFX90A-TGSPLIT-NEXT:    s_endpgm
4513    i32 addrspace(1)* %out, i32 %in) {
4514entry:
; Operation under test; %val is intentionally left unused by the kernel.
4515  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic
4516  ret void
4517}
4518
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("one-as") and `acquire` ordering; the result %val is unused.
; Expected sequence immediately after the atomic swap, per check prefix:
;   - GFX6                        -> `s_waitcnt vmcnt(0)` + `buffer_wbinvl1`
;   - GFX7                        -> `s_waitcnt vmcnt(0)` + `buffer_wbinvl1_vol`
;   - GFX10 (WGP and CU)          -> `s_waitcnt_vscnt null, 0x0` + `buffer_gl0_inv`
;                                    + `buffer_gl1_inv`
;   - SKIP-CACHE-INV              -> `s_waitcnt vmcnt(0)` only (no invalidate; that
;                                    run passes -amdgcn-skip-cache-invalidations)
;   - GFX90A (both tgsplit modes) -> `s_waitcnt vmcnt(0)` + `buffer_invl2`
;                                    + `buffer_wbinvl1_vol`
; No extra wait is expected before the swap.
4519define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
4520; GFX6-LABEL: global_system_one_as_acquire_atomicrmw:
4521; GFX6:       ; %bb.0: ; %entry
4522; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4523; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4524; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4525; GFX6-NEXT:    s_mov_b32 s2, -1
4526; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4527; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4528; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4529; GFX6-NEXT:    s_waitcnt vmcnt(0)
4530; GFX6-NEXT:    buffer_wbinvl1
4531; GFX6-NEXT:    s_endpgm
4532;
4533; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
4534; GFX7:       ; %bb.0: ; %entry
4535; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4536; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4537; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4538; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4539; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4540; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4541; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4542; GFX7-NEXT:    s_waitcnt vmcnt(0)
4543; GFX7-NEXT:    buffer_wbinvl1_vol
4544; GFX7-NEXT:    s_endpgm
4545;
4546; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
4547; GFX10-WGP:       ; %bb.0: ; %entry
4548; GFX10-WGP-NEXT:    s_clause 0x1
4549; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4550; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4551; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4552; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4553; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4554; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4555; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4556; GFX10-WGP-NEXT:    buffer_gl0_inv
4557; GFX10-WGP-NEXT:    buffer_gl1_inv
4558; GFX10-WGP-NEXT:    s_endpgm
4559;
4560; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw:
4561; GFX10-CU:       ; %bb.0: ; %entry
4562; GFX10-CU-NEXT:    s_clause 0x1
4563; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4564; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4565; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4566; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4567; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4568; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4569; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4570; GFX10-CU-NEXT:    buffer_gl0_inv
4571; GFX10-CU-NEXT:    buffer_gl1_inv
4572; GFX10-CU-NEXT:    s_endpgm
4573;
4574; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw:
4575; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4576; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4577; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4578; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4579; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4580; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4581; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4582; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4583; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4584; SKIP-CACHE-INV-NEXT:    s_endpgm
4585;
4586; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
4587; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4588; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4589; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4590; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4591; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4592; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4593; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4594; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4595; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4596; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4597; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4598;
4599; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
4600; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4601; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4602; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4603; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4604; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4605; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4606; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4607; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4608; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4609; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4610; GFX90A-TGSPLIT-NEXT:    s_endpgm
4611    i32 addrspace(1)* %out, i32 %in) {
4612entry:
; Operation under test; %val is intentionally left unused by the kernel.
4613  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
4614  ret void
4615}
4616
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("one-as") and `release` ordering; the result %val is unused.
; Expected sequence immediately before the atomic swap, per check prefix:
;   - GFX6, GFX7, SKIP-CACHE-INV  -> `s_waitcnt vmcnt(0)`
;   - GFX10 (WGP and CU)          -> `s_waitcnt vmcnt(0)` + `s_waitcnt_vscnt null, 0x0`
;   - GFX90A (both tgsplit modes) -> `buffer_wbl2` + `s_waitcnt vmcnt(0)`
; Nothing is expected between the swap and `s_endpgm`.
4617define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
4618; GFX6-LABEL: global_system_one_as_release_atomicrmw:
4619; GFX6:       ; %bb.0: ; %entry
4620; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4621; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4622; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4623; GFX6-NEXT:    s_mov_b32 s2, -1
4624; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4625; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4626; GFX6-NEXT:    s_waitcnt vmcnt(0)
4627; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4628; GFX6-NEXT:    s_endpgm
4629;
4630; GFX7-LABEL: global_system_one_as_release_atomicrmw:
4631; GFX7:       ; %bb.0: ; %entry
4632; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4633; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4634; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4635; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4636; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4637; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4638; GFX7-NEXT:    s_waitcnt vmcnt(0)
4639; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4640; GFX7-NEXT:    s_endpgm
4641;
4642; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw:
4643; GFX10-WGP:       ; %bb.0: ; %entry
4644; GFX10-WGP-NEXT:    s_clause 0x1
4645; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4646; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4647; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4648; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4649; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4650; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4651; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4652; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4653; GFX10-WGP-NEXT:    s_endpgm
4654;
4655; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw:
4656; GFX10-CU:       ; %bb.0: ; %entry
4657; GFX10-CU-NEXT:    s_clause 0x1
4658; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4659; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4660; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4661; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4662; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4663; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4664; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4665; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4666; GFX10-CU-NEXT:    s_endpgm
4667;
4668; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw:
4669; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4670; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4671; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4672; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4673; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4674; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4675; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4676; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4677; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4678; SKIP-CACHE-INV-NEXT:    s_endpgm
4679;
4680; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
4681; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4682; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4683; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4684; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4685; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4686; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4687; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4688; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4689; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4690; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4691;
4692; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
4693; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4694; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4695; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4696; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4697; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4698; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4699; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4700; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4701; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4702; GFX90A-TGSPLIT-NEXT:    s_endpgm
4703    i32 addrspace(1)* %out, i32 %in) {
4704entry:
; Operation under test; %val is intentionally left unused by the kernel.
4705  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release
4706  ret void
4707}
4708
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("one-as") and `acq_rel` ordering; the result %val is unused.
; The expected output brackets the atomic swap on both sides.
; Before the swap, per check prefix:
;   - GFX6, GFX7, SKIP-CACHE-INV  -> `s_waitcnt vmcnt(0)`
;   - GFX10 (WGP and CU)          -> `s_waitcnt vmcnt(0)` + `s_waitcnt_vscnt null, 0x0`
;   - GFX90A (both tgsplit modes) -> `buffer_wbl2` + `s_waitcnt vmcnt(0)`
; After the swap, per check prefix:
;   - GFX6                        -> `s_waitcnt vmcnt(0)` + `buffer_wbinvl1`
;   - GFX7                        -> `s_waitcnt vmcnt(0)` + `buffer_wbinvl1_vol`
;   - GFX10 (WGP and CU)          -> `s_waitcnt_vscnt null, 0x0` + `buffer_gl0_inv`
;                                    + `buffer_gl1_inv`
;   - SKIP-CACHE-INV              -> `s_waitcnt vmcnt(0)` only (no invalidate; that
;                                    run passes -amdgcn-skip-cache-invalidations)
;   - GFX90A (both tgsplit modes) -> `s_waitcnt vmcnt(0)` + `buffer_invl2`
;                                    + `buffer_wbinvl1_vol`
4709define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
4710; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw:
4711; GFX6:       ; %bb.0: ; %entry
4712; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4713; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4714; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4715; GFX6-NEXT:    s_mov_b32 s2, -1
4716; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4717; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4718; GFX6-NEXT:    s_waitcnt vmcnt(0)
4719; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4720; GFX6-NEXT:    s_waitcnt vmcnt(0)
4721; GFX6-NEXT:    buffer_wbinvl1
4722; GFX6-NEXT:    s_endpgm
4723;
4724; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
4725; GFX7:       ; %bb.0: ; %entry
4726; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4727; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4728; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4729; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4730; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4731; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4732; GFX7-NEXT:    s_waitcnt vmcnt(0)
4733; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4734; GFX7-NEXT:    s_waitcnt vmcnt(0)
4735; GFX7-NEXT:    buffer_wbinvl1_vol
4736; GFX7-NEXT:    s_endpgm
4737;
4738; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
4739; GFX10-WGP:       ; %bb.0: ; %entry
4740; GFX10-WGP-NEXT:    s_clause 0x1
4741; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4742; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4743; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4744; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4745; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4746; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4747; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4748; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4749; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4750; GFX10-WGP-NEXT:    buffer_gl0_inv
4751; GFX10-WGP-NEXT:    buffer_gl1_inv
4752; GFX10-WGP-NEXT:    s_endpgm
4753;
4754; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
4755; GFX10-CU:       ; %bb.0: ; %entry
4756; GFX10-CU-NEXT:    s_clause 0x1
4757; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4758; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4759; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4760; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4761; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4762; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4763; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4764; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4765; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4766; GFX10-CU-NEXT:    buffer_gl0_inv
4767; GFX10-CU-NEXT:    buffer_gl1_inv
4768; GFX10-CU-NEXT:    s_endpgm
4769;
4770; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw:
4771; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4772; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4773; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4774; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4775; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4776; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4777; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4778; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4779; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4780; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4781; SKIP-CACHE-INV-NEXT:    s_endpgm
4782;
4783; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
4784; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4785; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4786; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4787; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4788; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4789; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4790; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4791; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4792; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4793; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4794; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4795; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4796; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4797;
4798; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
4799; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4800; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4801; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4802; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4803; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4804; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4805; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4806; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4807; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4808; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4809; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4810; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4811; GFX90A-TGSPLIT-NEXT:    s_endpgm
4812    i32 addrspace(1)* %out, i32 %in) {
4813entry:
; Operation under test; %val is intentionally left unused by the kernel.
4814  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
4815  ret void
4816}
4817
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("one-as") and seq_cst ordering; the exchanged value is discarded.
; In the expectations below the swap is preceded by s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on gfx1010 and buffer_wbl2 on gfx90a) and followed by another
; wait and a cache invalidate. The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but expects no invalidate.
4818define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
4819; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw:
4820; GFX6:       ; %bb.0: ; %entry
4821; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4822; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4823; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4824; GFX6-NEXT:    s_mov_b32 s2, -1
4825; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4826; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4827; GFX6-NEXT:    s_waitcnt vmcnt(0)
4828; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
4829; GFX6-NEXT:    s_waitcnt vmcnt(0)
4830; GFX6-NEXT:    buffer_wbinvl1
4831; GFX6-NEXT:    s_endpgm
4832;
4833; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
4834; GFX7:       ; %bb.0: ; %entry
4835; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4836; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4837; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4839; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4840; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4841; GFX7-NEXT:    s_waitcnt vmcnt(0)
4842; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4843; GFX7-NEXT:    s_waitcnt vmcnt(0)
4844; GFX7-NEXT:    buffer_wbinvl1_vol
4845; GFX7-NEXT:    s_endpgm
4846;
4847; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
4848; GFX10-WGP:       ; %bb.0: ; %entry
4849; GFX10-WGP-NEXT:    s_clause 0x1
4850; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4851; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4852; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4853; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4854; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4855; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4856; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4857; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
4858; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4859; GFX10-WGP-NEXT:    buffer_gl0_inv
4860; GFX10-WGP-NEXT:    buffer_gl1_inv
4861; GFX10-WGP-NEXT:    s_endpgm
4862;
4863; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
4864; GFX10-CU:       ; %bb.0: ; %entry
4865; GFX10-CU-NEXT:    s_clause 0x1
4866; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4867; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4868; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4869; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4870; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4871; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4872; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4873; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
4874; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4875; GFX10-CU-NEXT:    buffer_gl0_inv
4876; GFX10-CU-NEXT:    buffer_gl1_inv
4877; GFX10-CU-NEXT:    s_endpgm
4878;
4879; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw:
4880; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4881; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4882; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4883; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4884; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4885; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4886; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4887; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4888; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
4889; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4890; SKIP-CACHE-INV-NEXT:    s_endpgm
4891;
4892; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
4893; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4894; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4895; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4896; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4897; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4898; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4899; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4900; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4901; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4902; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4903; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4904; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4905; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4906;
4907; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
4908; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4909; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4910; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4911; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4912; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4913; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4914; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4915; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4916; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
4917; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4918; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4919; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4920; GFX90A-TGSPLIT-NEXT:    s_endpgm
4921    i32 addrspace(1)* %out, i32 %in) {
4922entry:
  ; %val is never read; none of the expected swap instructions above carry glc.
4923  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
4924  ret void
4925}
4926
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("one-as") and acquire ordering; the result of the exchange is
; stored back through %out. In the expectations below the swap carries glc and
; has no wait ahead of it; it is followed by s_waitcnt vmcnt(0) and a cache
; invalidate before the store. The run that passes
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
4927define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
4928; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw:
4929; GFX6:       ; %bb.0: ; %entry
4930; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4931; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
4932; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
4933; GFX6-NEXT:    s_mov_b32 s2, -1
4934; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4935; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4936; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
4937; GFX6-NEXT:    s_waitcnt vmcnt(0)
4938; GFX6-NEXT:    buffer_wbinvl1
4939; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4940; GFX6-NEXT:    s_endpgm
4941;
4942; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
4943; GFX7:       ; %bb.0: ; %entry
4944; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4945; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4946; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4947; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4948; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4949; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4950; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4951; GFX7-NEXT:    s_waitcnt vmcnt(0)
4952; GFX7-NEXT:    buffer_wbinvl1_vol
4953; GFX7-NEXT:    flat_store_dword v[0:1], v2
4954; GFX7-NEXT:    s_endpgm
4955;
4956; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
4957; GFX10-WGP:       ; %bb.0: ; %entry
4958; GFX10-WGP-NEXT:    s_clause 0x1
4959; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4960; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4961; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4962; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4963; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4964; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4965; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4966; GFX10-WGP-NEXT:    buffer_gl0_inv
4967; GFX10-WGP-NEXT:    buffer_gl1_inv
4968; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
4969; GFX10-WGP-NEXT:    s_endpgm
4970;
4971; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
4972; GFX10-CU:       ; %bb.0: ; %entry
4973; GFX10-CU-NEXT:    s_clause 0x1
4974; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4975; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4976; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4977; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4978; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4979; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
4980; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4981; GFX10-CU-NEXT:    buffer_gl0_inv
4982; GFX10-CU-NEXT:    buffer_gl1_inv
4983; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
4984; GFX10-CU-NEXT:    s_endpgm
4985;
4986; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw:
4987; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4988; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4989; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4990; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4991; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4992; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4993; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4994; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
4995; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4996; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4997; SKIP-CACHE-INV-NEXT:    s_endpgm
4998;
4999; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
5000; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5001; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5002; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5003; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5004; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5005; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5006; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5007; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5008; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5009; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5010; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5011; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5012;
5013; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
5014; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5015; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5016; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5017; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5018; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5019; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5020; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5021; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5022; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5023; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5024; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5025; GFX90A-TGSPLIT-NEXT:    s_endpgm
5026    i32 addrspace(1)* %out, i32 %in) {
5027entry:
5028  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
  ; Storing %val makes the exchange result live, unlike the non-"ret" variants.
5029  store i32 %val, i32 addrspace(1)* %out, align 4
5030  ret void
5031}
5032
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("one-as") and acq_rel ordering; the result of the exchange is
; stored back through %out. In the expectations below the glc swap is preceded
; by s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt on gfx1010 and buffer_wbl2 on
; gfx90a) and followed by s_waitcnt vmcnt(0) and a cache invalidate before the
; store. The run that passes -amdgcn-skip-cache-invalidations keeps both waits
; but expects no invalidate.
5033define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
5034; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5035; GFX6:       ; %bb.0: ; %entry
5036; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5037; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
5038; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5039; GFX6-NEXT:    s_mov_b32 s2, -1
5040; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5041; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5042; GFX6-NEXT:    s_waitcnt vmcnt(0)
5043; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
5044; GFX6-NEXT:    s_waitcnt vmcnt(0)
5045; GFX6-NEXT:    buffer_wbinvl1
5046; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5047; GFX6-NEXT:    s_endpgm
5048;
5049; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5050; GFX7:       ; %bb.0: ; %entry
5051; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5052; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5053; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5055; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5056; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5057; GFX7-NEXT:    s_waitcnt vmcnt(0)
5058; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5059; GFX7-NEXT:    s_waitcnt vmcnt(0)
5060; GFX7-NEXT:    buffer_wbinvl1_vol
5061; GFX7-NEXT:    flat_store_dword v[0:1], v2
5062; GFX7-NEXT:    s_endpgm
5063;
5064; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5065; GFX10-WGP:       ; %bb.0: ; %entry
5066; GFX10-WGP-NEXT:    s_clause 0x1
5067; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5068; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5069; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5070; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5071; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5072; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5073; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5074; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5075; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5076; GFX10-WGP-NEXT:    buffer_gl0_inv
5077; GFX10-WGP-NEXT:    buffer_gl1_inv
5078; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
5079; GFX10-WGP-NEXT:    s_endpgm
5080;
5081; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5082; GFX10-CU:       ; %bb.0: ; %entry
5083; GFX10-CU-NEXT:    s_clause 0x1
5084; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5085; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5086; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5087; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5088; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5089; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5090; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5091; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5092; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5093; GFX10-CU-NEXT:    buffer_gl0_inv
5094; GFX10-CU-NEXT:    buffer_gl1_inv
5095; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
5096; GFX10-CU-NEXT:    s_endpgm
5097;
5098; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5099; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5100; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5101; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5102; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5103; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5104; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5105; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5106; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5107; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
5108; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5109; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5110; SKIP-CACHE-INV-NEXT:    s_endpgm
5111;
5112; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5113; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5114; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5115; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5116; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5117; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5118; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5119; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5120; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5121; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5122; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5123; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5124; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5125; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5126; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5127;
5128; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
5129; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5130; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5131; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5132; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5133; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5134; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5135; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5136; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5137; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5138; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5139; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5140; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5141; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5142; GFX90A-TGSPLIT-NEXT:    s_endpgm
5143    i32 addrspace(1)* %out, i32 %in) {
5144entry:
5145  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
  ; Storing %val makes the exchange result live, unlike the non-"ret" variants.
5146  store i32 %val, i32 addrspace(1)* %out, align 4
5147  ret void
5148}
5149
; Volatile atomicrmw xchg on a global (addrspace(1)) pointer with
; syncscope("one-as") and seq_cst ordering; the result of the exchange is
; stored back through %out. In the expectations below the glc swap is preceded
; by s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt on gfx1010 and buffer_wbl2 on
; gfx90a) and followed by s_waitcnt vmcnt(0) and a cache invalidate before the
; store. The run that passes -amdgcn-skip-cache-invalidations keeps both waits
; but expects no invalidate.
5150define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
5151; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5152; GFX6:       ; %bb.0: ; %entry
5153; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5154; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
5155; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5156; GFX6-NEXT:    s_mov_b32 s2, -1
5157; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5158; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5159; GFX6-NEXT:    s_waitcnt vmcnt(0)
5160; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
5161; GFX6-NEXT:    s_waitcnt vmcnt(0)
5162; GFX6-NEXT:    buffer_wbinvl1
5163; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5164; GFX6-NEXT:    s_endpgm
5165;
5166; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5167; GFX7:       ; %bb.0: ; %entry
5168; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5169; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5170; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5171; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5172; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5173; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5174; GFX7-NEXT:    s_waitcnt vmcnt(0)
5175; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5176; GFX7-NEXT:    s_waitcnt vmcnt(0)
5177; GFX7-NEXT:    buffer_wbinvl1_vol
5178; GFX7-NEXT:    flat_store_dword v[0:1], v2
5179; GFX7-NEXT:    s_endpgm
5180;
5181; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5182; GFX10-WGP:       ; %bb.0: ; %entry
5183; GFX10-WGP-NEXT:    s_clause 0x1
5184; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5185; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5186; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5187; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5188; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5189; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5190; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5191; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5192; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5193; GFX10-WGP-NEXT:    buffer_gl0_inv
5194; GFX10-WGP-NEXT:    buffer_gl1_inv
5195; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
5196; GFX10-WGP-NEXT:    s_endpgm
5197;
5198; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5199; GFX10-CU:       ; %bb.0: ; %entry
5200; GFX10-CU-NEXT:    s_clause 0x1
5201; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5202; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5203; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5204; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5205; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5206; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5207; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5208; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5209; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5210; GFX10-CU-NEXT:    buffer_gl0_inv
5211; GFX10-CU-NEXT:    buffer_gl1_inv
5212; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
5213; GFX10-CU-NEXT:    s_endpgm
5214;
5215; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5216; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5217; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5218; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5221; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5223; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5224; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
5225; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5226; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5227; SKIP-CACHE-INV-NEXT:    s_endpgm
5228;
5229; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5230; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5231; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5232; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5234; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5235; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5236; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5237; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5238; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5239; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5240; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5241; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5242; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5243; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5244;
5245; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
5246; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5247; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5248; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5249; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5250; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5251; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5252; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5253; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5254; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
5255; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5256; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5257; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5258; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
5259; GFX90A-TGSPLIT-NEXT:    s_endpgm
5260    i32 addrspace(1)* %out, i32 %in) {
5261entry:
5262  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
  ; Storing %val makes the exchange result live, unlike the non-"ret" variants.
5263  store i32 %val, i32 addrspace(1)* %out, align 4
5264  ret void
5265}
5266
; Volatile cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as")
; and monotonic success and failure orderings; the result is discarded.
; The expectations below contain only the argument loads, operand moves and
; the cmpswap itself: no vmcnt/vscnt wait and no cache invalidate or writeback
; appears on any target, including the -amdgcn-skip-cache-invalidations run.
5267define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
5268; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5269; GFX6:       ; %bb.0: ; %entry
5270; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5271; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5272; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5273; GFX6-NEXT:    s_mov_b32 s2, -1
5274; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5275; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5276; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5277; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5278; GFX6-NEXT:    s_endpgm
5279;
5280; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5281; GFX7:       ; %bb.0: ; %entry
5282; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5283; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5284; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5285; GFX7-NEXT:    s_add_u32 s0, s0, 16
5286; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5287; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5288; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5289; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5290; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5291; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5292; GFX7-NEXT:    s_endpgm
5293;
5294; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5295; GFX10-WGP:       ; %bb.0: ; %entry
5296; GFX10-WGP-NEXT:    s_clause 0x1
5297; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5298; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5299; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5300; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5301; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5302; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5303; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5304; GFX10-WGP-NEXT:    s_endpgm
5305;
5306; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5307; GFX10-CU:       ; %bb.0: ; %entry
5308; GFX10-CU-NEXT:    s_clause 0x1
5309; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5310; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5311; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5312; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5313; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5314; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5315; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5316; GFX10-CU-NEXT:    s_endpgm
5317;
5318; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5319; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5321; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5322; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5323; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5324; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5327; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5328; SKIP-CACHE-INV-NEXT:    s_endpgm
5329;
5330; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5331; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5332; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5333; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5334; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5335; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5336; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5337; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5338; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5339;
5340; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
5341; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5342; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5343; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5344; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5345; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5346; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5347; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5348; GFX90A-TGSPLIT-NEXT:    s_endpgm
5349    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5350entry:
  ; Element index 4 of i32 is a 16-byte displacement; the expectations show it
  ; as offset:16, or as an explicit scalar add of 16 in the gfx700 flat sequence.
5351  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read; none of the expected cmpswap instructions carry glc.
5352  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
5353  ret void
5354}
5355
; Volatile cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; acquire success ordering and monotonic failure ordering; the result is
; discarded. In the expectations below nothing is inserted ahead of the
; cmpswap; it is followed by a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt on
; gfx1010) and a cache invalidate. The run that passes
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
5356define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
5357; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5358; GFX6:       ; %bb.0: ; %entry
5359; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5360; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5361; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5362; GFX6-NEXT:    s_mov_b32 s2, -1
5363; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5364; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5365; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5366; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5367; GFX6-NEXT:    s_waitcnt vmcnt(0)
5368; GFX6-NEXT:    buffer_wbinvl1
5369; GFX6-NEXT:    s_endpgm
5370;
5371; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5372; GFX7:       ; %bb.0: ; %entry
5373; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5374; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5375; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5376; GFX7-NEXT:    s_add_u32 s0, s0, 16
5377; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5378; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5379; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5380; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5381; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5382; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5383; GFX7-NEXT:    s_waitcnt vmcnt(0)
5384; GFX7-NEXT:    buffer_wbinvl1_vol
5385; GFX7-NEXT:    s_endpgm
5386;
5387; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5388; GFX10-WGP:       ; %bb.0: ; %entry
5389; GFX10-WGP-NEXT:    s_clause 0x1
5390; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5391; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5392; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5393; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5394; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5395; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5396; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5397; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5398; GFX10-WGP-NEXT:    buffer_gl0_inv
5399; GFX10-WGP-NEXT:    buffer_gl1_inv
5400; GFX10-WGP-NEXT:    s_endpgm
5401;
5402; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5403; GFX10-CU:       ; %bb.0: ; %entry
5404; GFX10-CU-NEXT:    s_clause 0x1
5405; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5406; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5407; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5408; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5409; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5410; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5411; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5412; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5413; GFX10-CU-NEXT:    buffer_gl0_inv
5414; GFX10-CU-NEXT:    buffer_gl1_inv
5415; GFX10-CU-NEXT:    s_endpgm
5416;
5417; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5418; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5419; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5420; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5421; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5422; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5423; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5424; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5425; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5426; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5427; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5428; SKIP-CACHE-INV-NEXT:    s_endpgm
5429;
5430; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5431; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5432; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5433; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5434; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5435; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5436; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5437; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5438; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5439; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5440; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5441; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5442;
5443; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
5444; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5445; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5446; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5447; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5448; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5449; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5450; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5451; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5452; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5453; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5454; GFX90A-TGSPLIT-NEXT:    s_endpgm
5455    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5456entry:
  ; Element index 4 of i32 is a 16-byte displacement; the expectations show it
  ; as offset:16, or as an explicit scalar add of 16 in the gfx700 flat sequence.
5457  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read; none of the expected cmpswap instructions carry glc.
5458  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
5459  ret void
5460}
5461
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering release and failure ordering monotonic; the result is unused.
; Expected output on every run line: an s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on the gfx1010 runs and buffer_wbl2 on the gfx90a runs) is
; emitted ahead of the atomic, and s_endpgm follows the atomic directly.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
5462define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
5463; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5464; GFX6:       ; %bb.0: ; %entry
5465; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5466; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5467; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5468; GFX6-NEXT:    s_mov_b32 s2, -1
5469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5470; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5471; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5472; GFX6-NEXT:    s_waitcnt vmcnt(0)
5473; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5474; GFX6-NEXT:    s_endpgm
5475;
5476; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5477; GFX7:       ; %bb.0: ; %entry
5478; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5479; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5480; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5481; GFX7-NEXT:    s_add_u32 s0, s0, 16
5482; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5483; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5484; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5485; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5486; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5487; GFX7-NEXT:    s_waitcnt vmcnt(0)
5488; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5489; GFX7-NEXT:    s_endpgm
5490;
5491; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5492; GFX10-WGP:       ; %bb.0: ; %entry
5493; GFX10-WGP-NEXT:    s_clause 0x1
5494; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5495; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5496; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5497; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5499; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5500; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5501; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5502; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5503; GFX10-WGP-NEXT:    s_endpgm
5504;
5505; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5506; GFX10-CU:       ; %bb.0: ; %entry
5507; GFX10-CU-NEXT:    s_clause 0x1
5508; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5509; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5510; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5511; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5512; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5513; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5514; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5515; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5516; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5517; GFX10-CU-NEXT:    s_endpgm
5518;
5519; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5520; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5521; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5522; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5523; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5524; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5525; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5526; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5528; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5529; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5530; SKIP-CACHE-INV-NEXT:    s_endpgm
5531;
5532; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5533; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5534; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5535; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5536; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5537; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5538; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5539; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5540; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5541; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5542; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5543;
5544; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
5545; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5546; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5547; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5548; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5549; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5550; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5551; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5552; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5553; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5554; GFX90A-TGSPLIT-NEXT:    s_endpgm
5555    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5556entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
5557  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5558  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
5559  ret void
5560}
5561
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering acq_rel and failure ordering monotonic; the result is unused.
; Expected output on every run line: an s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on the gfx1010 runs and buffer_wbl2 on the gfx90a runs) ahead
; of the atomic, then a second wait after it followed by the buffer_*
; invalidate instructions. The run with -amdgcn-skip-cache-invalidations
; expects the trailing wait but no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
5562define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
5563; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5564; GFX6:       ; %bb.0: ; %entry
5565; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5566; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5567; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5568; GFX6-NEXT:    s_mov_b32 s2, -1
5569; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5570; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5571; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5572; GFX6-NEXT:    s_waitcnt vmcnt(0)
5573; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5574; GFX6-NEXT:    s_waitcnt vmcnt(0)
5575; GFX6-NEXT:    buffer_wbinvl1
5576; GFX6-NEXT:    s_endpgm
5577;
5578; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5579; GFX7:       ; %bb.0: ; %entry
5580; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5581; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5582; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5583; GFX7-NEXT:    s_add_u32 s0, s0, 16
5584; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5585; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5586; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5587; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5588; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5589; GFX7-NEXT:    s_waitcnt vmcnt(0)
5590; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5591; GFX7-NEXT:    s_waitcnt vmcnt(0)
5592; GFX7-NEXT:    buffer_wbinvl1_vol
5593; GFX7-NEXT:    s_endpgm
5594;
5595; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5596; GFX10-WGP:       ; %bb.0: ; %entry
5597; GFX10-WGP-NEXT:    s_clause 0x1
5598; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5599; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5600; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5601; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5602; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5603; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5604; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5605; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5606; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5607; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5608; GFX10-WGP-NEXT:    buffer_gl0_inv
5609; GFX10-WGP-NEXT:    buffer_gl1_inv
5610; GFX10-WGP-NEXT:    s_endpgm
5611;
5612; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5613; GFX10-CU:       ; %bb.0: ; %entry
5614; GFX10-CU-NEXT:    s_clause 0x1
5615; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5616; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5617; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5618; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5619; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5620; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5621; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5622; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5623; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5624; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5625; GFX10-CU-NEXT:    buffer_gl0_inv
5626; GFX10-CU-NEXT:    buffer_gl1_inv
5627; GFX10-CU-NEXT:    s_endpgm
5628;
5629; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5630; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5631; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5632; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5633; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5634; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5635; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5636; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5638; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5639; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5640; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5641; SKIP-CACHE-INV-NEXT:    s_endpgm
5642;
5643; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5644; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5645; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5646; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5647; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5648; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5649; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5650; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5651; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5652; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5653; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5654; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5655; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5656; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5657;
5658; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
5659; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5660; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5661; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5662; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5663; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5664; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5665; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5666; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5667; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5668; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5669; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5670; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5671; GFX90A-TGSPLIT-NEXT:    s_endpgm
5672    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5673entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
5674  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5675  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
5676  ret void
5677}
5678
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering seq_cst and failure ordering monotonic; the result is unused.
; Expected output on every run line: an s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on the gfx1010 runs and buffer_wbl2 on the gfx90a runs) ahead
; of the atomic, then a second wait after it followed by the buffer_*
; invalidate instructions. The run with -amdgcn-skip-cache-invalidations
; expects the trailing wait but no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
5679define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
5680; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5681; GFX6:       ; %bb.0: ; %entry
5682; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5683; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5684; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5685; GFX6-NEXT:    s_mov_b32 s2, -1
5686; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5687; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5688; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5689; GFX6-NEXT:    s_waitcnt vmcnt(0)
5690; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5691; GFX6-NEXT:    s_waitcnt vmcnt(0)
5692; GFX6-NEXT:    buffer_wbinvl1
5693; GFX6-NEXT:    s_endpgm
5694;
5695; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5696; GFX7:       ; %bb.0: ; %entry
5697; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5698; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5699; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5700; GFX7-NEXT:    s_add_u32 s0, s0, 16
5701; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5702; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5703; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5704; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5705; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5706; GFX7-NEXT:    s_waitcnt vmcnt(0)
5707; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5708; GFX7-NEXT:    s_waitcnt vmcnt(0)
5709; GFX7-NEXT:    buffer_wbinvl1_vol
5710; GFX7-NEXT:    s_endpgm
5711;
5712; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5713; GFX10-WGP:       ; %bb.0: ; %entry
5714; GFX10-WGP-NEXT:    s_clause 0x1
5715; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5716; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5717; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5718; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5719; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5720; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5721; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5722; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5723; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5724; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5725; GFX10-WGP-NEXT:    buffer_gl0_inv
5726; GFX10-WGP-NEXT:    buffer_gl1_inv
5727; GFX10-WGP-NEXT:    s_endpgm
5728;
5729; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5730; GFX10-CU:       ; %bb.0: ; %entry
5731; GFX10-CU-NEXT:    s_clause 0x1
5732; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5733; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5734; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5735; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5736; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5737; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5738; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5739; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5740; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5741; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5742; GFX10-CU-NEXT:    buffer_gl0_inv
5743; GFX10-CU-NEXT:    buffer_gl1_inv
5744; GFX10-CU-NEXT:    s_endpgm
5745;
5746; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5747; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5748; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5749; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5752; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5754; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5755; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5756; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5757; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5758; SKIP-CACHE-INV-NEXT:    s_endpgm
5759;
5760; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5761; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5762; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5763; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5764; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5765; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5766; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5767; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5769; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5770; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5771; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5772; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5773; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5774;
5775; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
5776; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5777; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5778; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5779; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5780; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5781; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5782; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5783; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5784; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5785; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5786; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5787; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5788; GFX90A-TGSPLIT-NEXT:    s_endpgm
5789    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5790entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
5791  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5792  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
5793  ret void
5794}
5795
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering acquire and failure ordering acquire; the result is unused.
; Expected output on every run line: no vmcnt/vscnt wait ahead of the atomic;
; after it comes a wait (s_waitcnt vmcnt(0), or s_waitcnt_vscnt on the gfx1010
; runs) followed by the buffer_* invalidate instructions. The run with
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
5796define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
5797; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5798; GFX6:       ; %bb.0: ; %entry
5799; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5800; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5801; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5802; GFX6-NEXT:    s_mov_b32 s2, -1
5803; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5804; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5805; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5806; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5807; GFX6-NEXT:    s_waitcnt vmcnt(0)
5808; GFX6-NEXT:    buffer_wbinvl1
5809; GFX6-NEXT:    s_endpgm
5810;
5811; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5812; GFX7:       ; %bb.0: ; %entry
5813; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5814; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5815; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5816; GFX7-NEXT:    s_add_u32 s0, s0, 16
5817; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5818; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5819; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5820; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5821; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5822; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5823; GFX7-NEXT:    s_waitcnt vmcnt(0)
5824; GFX7-NEXT:    buffer_wbinvl1_vol
5825; GFX7-NEXT:    s_endpgm
5826;
5827; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5828; GFX10-WGP:       ; %bb.0: ; %entry
5829; GFX10-WGP-NEXT:    s_clause 0x1
5830; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5831; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5832; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5833; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5834; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5835; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5836; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5837; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5838; GFX10-WGP-NEXT:    buffer_gl0_inv
5839; GFX10-WGP-NEXT:    buffer_gl1_inv
5840; GFX10-WGP-NEXT:    s_endpgm
5841;
5842; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5843; GFX10-CU:       ; %bb.0: ; %entry
5844; GFX10-CU-NEXT:    s_clause 0x1
5845; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5846; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5847; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5848; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5849; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5851; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5852; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5853; GFX10-CU-NEXT:    buffer_gl0_inv
5854; GFX10-CU-NEXT:    buffer_gl1_inv
5855; GFX10-CU-NEXT:    s_endpgm
5856;
5857; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5858; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5859; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5860; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5861; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5862; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5863; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5864; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5865; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5866; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5867; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5868; SKIP-CACHE-INV-NEXT:    s_endpgm
5869;
5870; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5871; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5872; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5873; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5874; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5875; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5876; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5877; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5878; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5879; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5880; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5881; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5882;
5883; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
5884; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5885; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5886; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5887; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5888; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5889; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5890; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5891; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5892; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5893; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5894; GFX90A-TGSPLIT-NEXT:    s_endpgm
5895    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5896entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
5897  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5898  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
5899  ret void
5900}
5901
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering release and failure ordering acquire; the result is unused.
; Expected output on every run line: an s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on the gfx1010 runs and buffer_wbl2 on the gfx90a runs) ahead
; of the atomic, then a second wait after it followed by the buffer_*
; invalidate instructions. The run with -amdgcn-skip-cache-invalidations
; expects the trailing wait but no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
5902define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
5903; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg:
5904; GFX6:       ; %bb.0: ; %entry
5905; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5906; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
5907; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
5908; GFX6-NEXT:    s_mov_b32 s2, -1
5909; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5910; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5911; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5912; GFX6-NEXT:    s_waitcnt vmcnt(0)
5913; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5914; GFX6-NEXT:    s_waitcnt vmcnt(0)
5915; GFX6-NEXT:    buffer_wbinvl1
5916; GFX6-NEXT:    s_endpgm
5917;
5918; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
5919; GFX7:       ; %bb.0: ; %entry
5920; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5921; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5922; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5923; GFX7-NEXT:    s_add_u32 s0, s0, 16
5924; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5925; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5926; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5927; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5928; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5929; GFX7-NEXT:    s_waitcnt vmcnt(0)
5930; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5931; GFX7-NEXT:    s_waitcnt vmcnt(0)
5932; GFX7-NEXT:    buffer_wbinvl1_vol
5933; GFX7-NEXT:    s_endpgm
5934;
5935; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
5936; GFX10-WGP:       ; %bb.0: ; %entry
5937; GFX10-WGP-NEXT:    s_clause 0x1
5938; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5939; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5940; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5941; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5942; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5943; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5944; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5945; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5946; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5947; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5948; GFX10-WGP-NEXT:    buffer_gl0_inv
5949; GFX10-WGP-NEXT:    buffer_gl1_inv
5950; GFX10-WGP-NEXT:    s_endpgm
5951;
5952; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
5953; GFX10-CU:       ; %bb.0: ; %entry
5954; GFX10-CU-NEXT:    s_clause 0x1
5955; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5956; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5957; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5958; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5959; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5960; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5961; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5962; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5963; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
5964; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5965; GFX10-CU-NEXT:    buffer_gl0_inv
5966; GFX10-CU-NEXT:    buffer_gl1_inv
5967; GFX10-CU-NEXT:    s_endpgm
5968;
5969; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg:
5970; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5971; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5972; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5973; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5974; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5975; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5976; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5977; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5978; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5979; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5980; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5981; SKIP-CACHE-INV-NEXT:    s_endpgm
5982;
5983; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
5984; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5985; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5986; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5987; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
5988; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5989; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
5990; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5991; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5992; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
5993; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5994; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5995; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5996; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5997;
5998; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
5999; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6000; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6001; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6002; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6003; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6004; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6005; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6006; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6007; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6008; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6009; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6010; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6011; GFX90A-TGSPLIT-NEXT:    s_endpgm
6012    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6013entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
6014  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6015  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
6016  ret void
6017}
6018
; Volatile cmpxchg on an addrspace(1) pointer at syncscope("one-as") with
; success ordering acq_rel and failure ordering acquire; the result is unused.
; Expected output on every run line: an s_waitcnt vmcnt(0) (plus
; s_waitcnt_vscnt on the gfx1010 runs and buffer_wbl2 on the gfx90a runs) ahead
; of the atomic, then a second wait after it followed by the buffer_*
; invalidate instructions. The run with -amdgcn-skip-cache-invalidations
; expects the trailing wait but no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them instead of hand-editing.
6019define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
6020; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6021; GFX6:       ; %bb.0: ; %entry
6022; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6023; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6024; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6025; GFX6-NEXT:    s_mov_b32 s2, -1
6026; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6027; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6028; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6029; GFX6-NEXT:    s_waitcnt vmcnt(0)
6030; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
6031; GFX6-NEXT:    s_waitcnt vmcnt(0)
6032; GFX6-NEXT:    buffer_wbinvl1
6033; GFX6-NEXT:    s_endpgm
6034;
6035; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6036; GFX7:       ; %bb.0: ; %entry
6037; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6038; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6039; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6040; GFX7-NEXT:    s_add_u32 s0, s0, 16
6041; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6042; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6043; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6044; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6045; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6046; GFX7-NEXT:    s_waitcnt vmcnt(0)
6047; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6048; GFX7-NEXT:    s_waitcnt vmcnt(0)
6049; GFX7-NEXT:    buffer_wbinvl1_vol
6050; GFX7-NEXT:    s_endpgm
6051;
6052; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6053; GFX10-WGP:       ; %bb.0: ; %entry
6054; GFX10-WGP-NEXT:    s_clause 0x1
6055; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6056; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6057; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6058; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6059; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6060; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6061; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6062; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6063; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6064; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6065; GFX10-WGP-NEXT:    buffer_gl0_inv
6066; GFX10-WGP-NEXT:    buffer_gl1_inv
6067; GFX10-WGP-NEXT:    s_endpgm
6068;
6069; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6070; GFX10-CU:       ; %bb.0: ; %entry
6071; GFX10-CU-NEXT:    s_clause 0x1
6072; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6073; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6074; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6075; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6076; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6077; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6078; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6079; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6080; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6081; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6082; GFX10-CU-NEXT:    buffer_gl0_inv
6083; GFX10-CU-NEXT:    buffer_gl1_inv
6084; GFX10-CU-NEXT:    s_endpgm
6085;
6086; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6087; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6088; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6089; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6090; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6091; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6092; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6093; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6095; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6096; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
6097; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6098; SKIP-CACHE-INV-NEXT:    s_endpgm
6099;
6100; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6101; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6103; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6104; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6105; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6106; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6107; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6109; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6110; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6111; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6112; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6113; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6114;
6115; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
6116; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6117; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6119; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6120; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6121; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6122; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6123; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6124; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6125; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6126; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6127; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6128; GFX90A-TGSPLIT-NEXT:    s_endpgm
6129    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6130entry:
  ; Element index 4 of %out: this is the 16-byte displacement in the expected
  ; instructions (offset:16, or the s_add_u32 ..., 16 of the amdhsa gfx700 run).
6131  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
6132  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
6133  ret void
6134}
6135
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), success ordering seq_cst, failure ordering acquire.
; The cmpxchg result is not used by the kernel.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. In every configuration an s_waitcnt
; comes directly before the atomic; a cache-invalidate instruction follows it
; in all runs except the -amdgcn-skip-cache-invalidations one.
6136define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
6137; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6138; GFX6:       ; %bb.0: ; %entry
6139; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6140; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6141; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6142; GFX6-NEXT:    s_mov_b32 s2, -1
6143; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6144; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6145; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6146; GFX6-NEXT:    s_waitcnt vmcnt(0)
6147; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
6148; GFX6-NEXT:    s_waitcnt vmcnt(0)
6149; GFX6-NEXT:    buffer_wbinvl1
6150; GFX6-NEXT:    s_endpgm
6151;
6152; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6153; GFX7:       ; %bb.0: ; %entry
6154; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6155; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6156; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6157; GFX7-NEXT:    s_add_u32 s0, s0, 16
6158; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6159; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6160; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6161; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6162; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6163; GFX7-NEXT:    s_waitcnt vmcnt(0)
6164; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6165; GFX7-NEXT:    s_waitcnt vmcnt(0)
6166; GFX7-NEXT:    buffer_wbinvl1_vol
6167; GFX7-NEXT:    s_endpgm
6168;
6169; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6170; GFX10-WGP:       ; %bb.0: ; %entry
6171; GFX10-WGP-NEXT:    s_clause 0x1
6172; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6173; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6174; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6175; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6176; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6177; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6178; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6179; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6180; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6181; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6182; GFX10-WGP-NEXT:    buffer_gl0_inv
6183; GFX10-WGP-NEXT:    buffer_gl1_inv
6184; GFX10-WGP-NEXT:    s_endpgm
6185;
6186; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6187; GFX10-CU:       ; %bb.0: ; %entry
6188; GFX10-CU-NEXT:    s_clause 0x1
6189; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6190; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6191; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6192; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6193; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6194; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6195; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6196; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6197; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6198; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6199; GFX10-CU-NEXT:    buffer_gl0_inv
6200; GFX10-CU-NEXT:    buffer_gl1_inv
6201; GFX10-CU-NEXT:    s_endpgm
6202;
6203; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6204; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6205; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6206; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6207; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6208; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6209; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6210; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6212; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6213; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
6214; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6215; SKIP-CACHE-INV-NEXT:    s_endpgm
6216;
6217; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6218; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6219; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6220; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6221; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6222; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6223; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6224; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6225; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6226; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6227; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6228; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6229; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6230; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6231;
6232; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
6233; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6234; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6235; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6236; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6237; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6238; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6239; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6240; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6241; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6242; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6243; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6244; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6245; GFX90A-TGSPLIT-NEXT:    s_endpgm
6246    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6247entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6248  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: seq_cst on success, acquire on failure. %val is left unused.
6249  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
6250  ret void
6251}
6252
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), with seq_cst as both the success and the failure
; ordering. The cmpxchg result is not used by the kernel.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. In every configuration an s_waitcnt
; comes directly before the atomic; a cache-invalidate instruction follows it
; in all runs except the -amdgcn-skip-cache-invalidations one.
6253define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
6254; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6255; GFX6:       ; %bb.0: ; %entry
6256; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6257; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6258; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6259; GFX6-NEXT:    s_mov_b32 s2, -1
6260; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6261; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6262; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6263; GFX6-NEXT:    s_waitcnt vmcnt(0)
6264; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
6265; GFX6-NEXT:    s_waitcnt vmcnt(0)
6266; GFX6-NEXT:    buffer_wbinvl1
6267; GFX6-NEXT:    s_endpgm
6268;
6269; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6270; GFX7:       ; %bb.0: ; %entry
6271; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6272; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6273; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6274; GFX7-NEXT:    s_add_u32 s0, s0, 16
6275; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6276; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6277; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6278; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6279; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6280; GFX7-NEXT:    s_waitcnt vmcnt(0)
6281; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6282; GFX7-NEXT:    s_waitcnt vmcnt(0)
6283; GFX7-NEXT:    buffer_wbinvl1_vol
6284; GFX7-NEXT:    s_endpgm
6285;
6286; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6287; GFX10-WGP:       ; %bb.0: ; %entry
6288; GFX10-WGP-NEXT:    s_clause 0x1
6289; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6290; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6291; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6292; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6293; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6294; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6295; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6296; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6297; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6298; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6299; GFX10-WGP-NEXT:    buffer_gl0_inv
6300; GFX10-WGP-NEXT:    buffer_gl1_inv
6301; GFX10-WGP-NEXT:    s_endpgm
6302;
6303; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6304; GFX10-CU:       ; %bb.0: ; %entry
6305; GFX10-CU-NEXT:    s_clause 0x1
6306; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6307; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6308; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6309; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6310; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6311; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6312; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6313; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6314; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
6315; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6316; GFX10-CU-NEXT:    buffer_gl0_inv
6317; GFX10-CU-NEXT:    buffer_gl1_inv
6318; GFX10-CU-NEXT:    s_endpgm
6319;
6320; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6321; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6322; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6323; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6324; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6325; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6326; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6329; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6330; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
6331; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6332; SKIP-CACHE-INV-NEXT:    s_endpgm
6333;
6334; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6335; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6336; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6337; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6338; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6339; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6340; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6341; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6342; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6343; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6344; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6345; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6346; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6347; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6348;
6349; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
6350; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6351; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6352; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6353; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6354; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6355; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6356; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6357; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6358; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
6359; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6360; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6361; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6362; GFX90A-TGSPLIT-NEXT:    s_endpgm
6363    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6364entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6365  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: seq_cst on success, seq_cst on failure. %val is left unused.
6366  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
6367  ret void
6368}
6369
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), success ordering acquire, failure ordering monotonic.
; This is a "ret" variant: the value returned by the cmpxchg is stored to %out.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. The atomic carries the glc modifier,
; is followed by s_waitcnt vmcnt(0), and then by a cache-invalidate instruction
; in all runs except the -amdgcn-skip-cache-invalidations one; the store of the
; result comes last.
6370define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
6371; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6372; GFX6:       ; %bb.0: ; %entry
6373; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6374; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6375; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6376; GFX6-NEXT:    s_mov_b32 s2, -1
6377; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6378; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6379; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6380; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6381; GFX6-NEXT:    s_waitcnt vmcnt(0)
6382; GFX6-NEXT:    buffer_wbinvl1
6383; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6384; GFX6-NEXT:    s_endpgm
6385;
6386; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6387; GFX7:       ; %bb.0: ; %entry
6388; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6389; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6390; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6391; GFX7-NEXT:    s_add_u32 s4, s0, 16
6392; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6393; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6394; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6395; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6396; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6397; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6398; GFX7-NEXT:    s_waitcnt vmcnt(0)
6399; GFX7-NEXT:    buffer_wbinvl1_vol
6400; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6401; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6402; GFX7-NEXT:    flat_store_dword v[0:1], v2
6403; GFX7-NEXT:    s_endpgm
6404;
6405; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6406; GFX10-WGP:       ; %bb.0: ; %entry
6407; GFX10-WGP-NEXT:    s_clause 0x1
6408; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6409; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6410; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6411; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6412; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6413; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6414; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6415; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6416; GFX10-WGP-NEXT:    buffer_gl0_inv
6417; GFX10-WGP-NEXT:    buffer_gl1_inv
6418; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6419; GFX10-WGP-NEXT:    s_endpgm
6420;
6421; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6422; GFX10-CU:       ; %bb.0: ; %entry
6423; GFX10-CU-NEXT:    s_clause 0x1
6424; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6425; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6426; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6427; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6428; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6429; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6430; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6431; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6432; GFX10-CU-NEXT:    buffer_gl0_inv
6433; GFX10-CU-NEXT:    buffer_gl1_inv
6434; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6435; GFX10-CU-NEXT:    s_endpgm
6436;
6437; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6438; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6439; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6440; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6441; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6442; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6443; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6444; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6445; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6446; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6447; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6448; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6449; SKIP-CACHE-INV-NEXT:    s_endpgm
6450;
6451; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6452; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6453; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6454; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6455; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6456; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6457; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6458; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6459; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6460; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6461; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6462; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6463; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6464;
6465; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
6466; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6467; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6468; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6469; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6470; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6471; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6472; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6473; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6474; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6475; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6476; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6477; GFX90A-TGSPLIT-NEXT:    s_endpgm
6478    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6479entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6480  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: acquire on success, monotonic on failure.
6481  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the i32 value read from memory.
6482  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to %out itself (element 0, not the %gep slot).
6483  store i32 %val0, i32 addrspace(1)* %out, align 4
6484  ret void
6485}
6486
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), success ordering acq_rel, failure ordering monotonic.
; This is a "ret" variant: the value returned by the cmpxchg is stored to %out.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. An s_waitcnt comes directly before the
; atomic, which carries the glc modifier; s_waitcnt vmcnt(0) follows it, then a
; cache-invalidate instruction in all runs except the
; -amdgcn-skip-cache-invalidations one; the store of the result comes last.
6487define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
6488; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6489; GFX6:       ; %bb.0: ; %entry
6490; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6491; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6492; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6493; GFX6-NEXT:    s_mov_b32 s2, -1
6494; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6495; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6496; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6497; GFX6-NEXT:    s_waitcnt vmcnt(0)
6498; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6499; GFX6-NEXT:    s_waitcnt vmcnt(0)
6500; GFX6-NEXT:    buffer_wbinvl1
6501; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6502; GFX6-NEXT:    s_endpgm
6503;
6504; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6505; GFX7:       ; %bb.0: ; %entry
6506; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6507; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6508; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6509; GFX7-NEXT:    s_add_u32 s4, s0, 16
6510; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6511; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6512; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6513; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6514; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6515; GFX7-NEXT:    s_waitcnt vmcnt(0)
6516; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6517; GFX7-NEXT:    s_waitcnt vmcnt(0)
6518; GFX7-NEXT:    buffer_wbinvl1_vol
6519; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6520; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6521; GFX7-NEXT:    flat_store_dword v[0:1], v2
6522; GFX7-NEXT:    s_endpgm
6523;
6524; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6525; GFX10-WGP:       ; %bb.0: ; %entry
6526; GFX10-WGP-NEXT:    s_clause 0x1
6527; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6528; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6529; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6530; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6531; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6532; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6533; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6534; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6535; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6536; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6537; GFX10-WGP-NEXT:    buffer_gl0_inv
6538; GFX10-WGP-NEXT:    buffer_gl1_inv
6539; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6540; GFX10-WGP-NEXT:    s_endpgm
6541;
6542; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6543; GFX10-CU:       ; %bb.0: ; %entry
6544; GFX10-CU-NEXT:    s_clause 0x1
6545; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6546; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6547; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6548; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6549; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6550; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6551; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6552; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6553; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6554; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6555; GFX10-CU-NEXT:    buffer_gl0_inv
6556; GFX10-CU-NEXT:    buffer_gl1_inv
6557; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6558; GFX10-CU-NEXT:    s_endpgm
6559;
6560; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6561; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6562; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6563; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6564; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6565; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6566; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6569; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6570; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6571; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6572; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6573; SKIP-CACHE-INV-NEXT:    s_endpgm
6574;
6575; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6576; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6577; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6578; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6579; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6581; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6582; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6583; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6584; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6585; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6586; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6587; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6588; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6589; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6590;
6591; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
6592; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6593; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6594; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6595; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6596; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6597; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6598; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6599; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6600; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6601; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6602; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6603; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6604; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6605; GFX90A-TGSPLIT-NEXT:    s_endpgm
6606    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6607entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6608  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: acq_rel on success, monotonic on failure.
6609  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the i32 value read from memory.
6610  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to %out itself (element 0, not the %gep slot).
6611  store i32 %val0, i32 addrspace(1)* %out, align 4
6612  ret void
6613}
6614
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), success ordering seq_cst, failure ordering monotonic.
; This is a "ret" variant: the value returned by the cmpxchg is stored to %out.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. An s_waitcnt comes directly before the
; atomic, which carries the glc modifier; s_waitcnt vmcnt(0) follows it, then a
; cache-invalidate instruction in all runs except the
; -amdgcn-skip-cache-invalidations one; the store of the result comes last.
6615define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
6616; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6617; GFX6:       ; %bb.0: ; %entry
6618; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6619; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6620; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6621; GFX6-NEXT:    s_mov_b32 s2, -1
6622; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6623; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6624; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6625; GFX6-NEXT:    s_waitcnt vmcnt(0)
6626; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6627; GFX6-NEXT:    s_waitcnt vmcnt(0)
6628; GFX6-NEXT:    buffer_wbinvl1
6629; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6630; GFX6-NEXT:    s_endpgm
6631;
6632; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6633; GFX7:       ; %bb.0: ; %entry
6634; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6635; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6636; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6637; GFX7-NEXT:    s_add_u32 s4, s0, 16
6638; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6639; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6640; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6641; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6642; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6643; GFX7-NEXT:    s_waitcnt vmcnt(0)
6644; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6645; GFX7-NEXT:    s_waitcnt vmcnt(0)
6646; GFX7-NEXT:    buffer_wbinvl1_vol
6647; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6648; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6649; GFX7-NEXT:    flat_store_dword v[0:1], v2
6650; GFX7-NEXT:    s_endpgm
6651;
6652; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6653; GFX10-WGP:       ; %bb.0: ; %entry
6654; GFX10-WGP-NEXT:    s_clause 0x1
6655; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6656; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6657; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6658; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6659; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6660; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6661; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6662; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6663; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6664; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6665; GFX10-WGP-NEXT:    buffer_gl0_inv
6666; GFX10-WGP-NEXT:    buffer_gl1_inv
6667; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6668; GFX10-WGP-NEXT:    s_endpgm
6669;
6670; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6671; GFX10-CU:       ; %bb.0: ; %entry
6672; GFX10-CU-NEXT:    s_clause 0x1
6673; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6674; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6675; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6676; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6677; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6678; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6679; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6680; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6681; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6682; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6683; GFX10-CU-NEXT:    buffer_gl0_inv
6684; GFX10-CU-NEXT:    buffer_gl1_inv
6685; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6686; GFX10-CU-NEXT:    s_endpgm
6687;
6688; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6689; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6690; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6691; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6692; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6693; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6694; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6697; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6698; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6699; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6700; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6701; SKIP-CACHE-INV-NEXT:    s_endpgm
6702;
6703; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6704; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6705; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6706; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6707; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6708; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6709; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6710; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6711; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6712; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6713; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6714; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6715; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6716; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6717; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6718;
6719; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
6720; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6721; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6722; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6723; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6724; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6725; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6726; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6727; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6728; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6729; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6730; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6731; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6732; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6733; GFX90A-TGSPLIT-NEXT:    s_endpgm
6734    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6735entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6736  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: seq_cst on success, monotonic on failure.
6737  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the i32 value read from memory.
6738  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to %out itself (element 0, not the %gep slot).
6739  store i32 %val0, i32 addrspace(1)* %out, align 4
6740  ret void
6741}
6742
; Codegen test: volatile cmpxchg on an i32 in addrspace(1) using
; syncscope("one-as"), with acquire as both the success and the failure
; ordering. This is a "ret" variant: the value returned by the cmpxchg is
; stored to %out.
; The autogenerated assertions below list the expected instructions for each
; llc configuration from the RUN lines. The atomic carries the glc modifier,
; is followed by s_waitcnt vmcnt(0), and then by a cache-invalidate instruction
; in all runs except the -amdgcn-skip-cache-invalidations one; the store of the
; result comes last.
6743define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
6744; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6745; GFX6:       ; %bb.0: ; %entry
6746; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6747; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6748; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6749; GFX6-NEXT:    s_mov_b32 s2, -1
6750; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6751; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6752; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6753; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6754; GFX6-NEXT:    s_waitcnt vmcnt(0)
6755; GFX6-NEXT:    buffer_wbinvl1
6756; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6757; GFX6-NEXT:    s_endpgm
6758;
6759; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6760; GFX7:       ; %bb.0: ; %entry
6761; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6762; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6763; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6764; GFX7-NEXT:    s_add_u32 s4, s0, 16
6765; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6766; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6767; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6768; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6769; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6770; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6771; GFX7-NEXT:    s_waitcnt vmcnt(0)
6772; GFX7-NEXT:    buffer_wbinvl1_vol
6773; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6774; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6775; GFX7-NEXT:    flat_store_dword v[0:1], v2
6776; GFX7-NEXT:    s_endpgm
6777;
6778; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6779; GFX10-WGP:       ; %bb.0: ; %entry
6780; GFX10-WGP-NEXT:    s_clause 0x1
6781; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6782; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6783; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6784; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6785; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6786; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6787; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6788; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6789; GFX10-WGP-NEXT:    buffer_gl0_inv
6790; GFX10-WGP-NEXT:    buffer_gl1_inv
6791; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6792; GFX10-WGP-NEXT:    s_endpgm
6793;
6794; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6795; GFX10-CU:       ; %bb.0: ; %entry
6796; GFX10-CU-NEXT:    s_clause 0x1
6797; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6798; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6799; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6800; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6801; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6802; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6803; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6804; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6805; GFX10-CU-NEXT:    buffer_gl0_inv
6806; GFX10-CU-NEXT:    buffer_gl1_inv
6807; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6808; GFX10-CU-NEXT:    s_endpgm
6809;
6810; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6811; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6812; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6813; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6814; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6815; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6816; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6817; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6818; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6819; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6820; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6821; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6822; SKIP-CACHE-INV-NEXT:    s_endpgm
6823;
6824; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6825; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6826; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6827; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6828; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6829; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6830; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6831; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6832; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6833; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6834; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6835; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6836; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6837;
6838; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
6839; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6840; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6841; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6842; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6843; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6844; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6845; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6846; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6847; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6848; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6849; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6850; GFX90A-TGSPLIT-NEXT:    s_endpgm
6851    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6852entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset:16 / "+ 16" seen in the expected instructions above).
6853  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Orderings: acquire on success, acquire on failure.
6854  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
  ; Field 0 of the { i32, i1 } result is the i32 value read from memory.
6855  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write that value to %out itself (element 0, not the %gep slot).
6856  store i32 %val0, i32 addrspace(1)* %out, align 4
6857  ret void
6858}
6859
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with
; syncscope("one-as"), success ordering release and failure ordering acquire.
; The value returned by the cmpxchg is stored back through %out.
; The check lines inside this definition are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
6860define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
6861; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6862; GFX6:       ; %bb.0: ; %entry
6863; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6864; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6865; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6866; GFX6-NEXT:    s_mov_b32 s2, -1
6867; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6868; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6869; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6870; GFX6-NEXT:    s_waitcnt vmcnt(0)
6871; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6872; GFX6-NEXT:    s_waitcnt vmcnt(0)
6873; GFX6-NEXT:    buffer_wbinvl1
6874; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6875; GFX6-NEXT:    s_endpgm
6876;
6877; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6878; GFX7:       ; %bb.0: ; %entry
6879; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6880; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6881; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6882; GFX7-NEXT:    s_add_u32 s4, s0, 16
6883; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6884; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6885; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6886; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6887; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6888; GFX7-NEXT:    s_waitcnt vmcnt(0)
6889; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6890; GFX7-NEXT:    s_waitcnt vmcnt(0)
6891; GFX7-NEXT:    buffer_wbinvl1_vol
6892; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6893; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6894; GFX7-NEXT:    flat_store_dword v[0:1], v2
6895; GFX7-NEXT:    s_endpgm
6896;
6897; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6898; GFX10-WGP:       ; %bb.0: ; %entry
6899; GFX10-WGP-NEXT:    s_clause 0x1
6900; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6901; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6902; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
6903; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6904; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6905; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6906; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6907; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6908; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6909; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6910; GFX10-WGP-NEXT:    buffer_gl0_inv
6911; GFX10-WGP-NEXT:    buffer_gl1_inv
6912; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
6913; GFX10-WGP-NEXT:    s_endpgm
6914;
6915; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6916; GFX10-CU:       ; %bb.0: ; %entry
6917; GFX10-CU-NEXT:    s_clause 0x1
6918; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
6919; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6920; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
6921; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6922; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6923; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6924; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6925; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6926; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
6927; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6928; GFX10-CU-NEXT:    buffer_gl0_inv
6929; GFX10-CU-NEXT:    buffer_gl1_inv
6930; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
6931; GFX10-CU-NEXT:    s_endpgm
6932;
6933; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6934; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6935; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6936; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6937; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
6938; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
6939; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6941; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6942; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6943; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6944; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6945; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6946; SKIP-CACHE-INV-NEXT:    s_endpgm
6947;
6948; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6949; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6950; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6951; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6952; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6953; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6954; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6955; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6956; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6957; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6958; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6959; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6960; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6961; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6962; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6963;
6964; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
6965; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6966; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6967; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6968; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
6969; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6970; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
6971; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6972; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6973; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
6974; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6975; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6976; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6977; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
6978; GFX90A-TGSPLIT-NEXT:    s_endpgm
6979    i32 addrspace(1)* %out, i32 %in, i32 %old) {
6980entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16.
6981  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old and swap in %in; orderings are release on success
  ; and acquire on failure.
6982  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the
  ; i1 success flag is not used.
6983  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
6984  store i32 %val0, i32 addrspace(1)* %out, align 4
6985  ret void
6986}
6987
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with
; syncscope("one-as"), success ordering acq_rel and failure ordering acquire.
; The value returned by the cmpxchg is stored back through %out.
; The check lines inside this definition are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
6988define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
6989; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
6990; GFX6:       ; %bb.0: ; %entry
6991; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6992; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
6993; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
6994; GFX6-NEXT:    s_mov_b32 s2, -1
6995; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6996; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6997; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6998; GFX6-NEXT:    s_waitcnt vmcnt(0)
6999; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7000; GFX6-NEXT:    s_waitcnt vmcnt(0)
7001; GFX6-NEXT:    buffer_wbinvl1
7002; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7003; GFX6-NEXT:    s_endpgm
7004;
7005; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7006; GFX7:       ; %bb.0: ; %entry
7007; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7008; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7009; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7010; GFX7-NEXT:    s_add_u32 s4, s0, 16
7011; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7012; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7013; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7014; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7015; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7016; GFX7-NEXT:    s_waitcnt vmcnt(0)
7017; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7018; GFX7-NEXT:    s_waitcnt vmcnt(0)
7019; GFX7-NEXT:    buffer_wbinvl1_vol
7020; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7021; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7022; GFX7-NEXT:    flat_store_dword v[0:1], v2
7023; GFX7-NEXT:    s_endpgm
7024;
7025; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7026; GFX10-WGP:       ; %bb.0: ; %entry
7027; GFX10-WGP-NEXT:    s_clause 0x1
7028; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7029; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7030; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
7031; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7032; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7033; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7034; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7035; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7036; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7037; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7038; GFX10-WGP-NEXT:    buffer_gl0_inv
7039; GFX10-WGP-NEXT:    buffer_gl1_inv
7040; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
7041; GFX10-WGP-NEXT:    s_endpgm
7042;
7043; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7044; GFX10-CU:       ; %bb.0: ; %entry
7045; GFX10-CU-NEXT:    s_clause 0x1
7046; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7047; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7048; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
7049; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7050; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7051; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7052; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7053; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7054; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7055; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7056; GFX10-CU-NEXT:    buffer_gl0_inv
7057; GFX10-CU-NEXT:    buffer_gl1_inv
7058; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
7059; GFX10-CU-NEXT:    s_endpgm
7060;
7061; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7062; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7063; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7064; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7065; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
7066; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
7067; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7069; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7070; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7071; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7072; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7073; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7074; SKIP-CACHE-INV-NEXT:    s_endpgm
7075;
7076; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7077; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7078; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7079; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7080; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7081; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7082; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7083; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7084; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7085; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7086; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7087; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7088; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7089; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7090; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7091;
7092; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
7093; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7094; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7095; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7097; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7098; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7099; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7100; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7101; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7102; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7103; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7104; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7105; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7106; GFX90A-TGSPLIT-NEXT:    s_endpgm
7107    i32 addrspace(1)* %out, i32 %in, i32 %old) {
7108entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16.
7109  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old and swap in %in; orderings are acq_rel on success
  ; and acquire on failure.
7110  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the
  ; i1 success flag is not used.
7111  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
7112  store i32 %val0, i32 addrspace(1)* %out, align 4
7113  ret void
7114}
7115
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with
; syncscope("one-as"), success ordering seq_cst and failure ordering acquire.
; The value returned by the cmpxchg is stored back through %out.
; The check lines inside this definition are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
7116define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
7117; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7118; GFX6:       ; %bb.0: ; %entry
7119; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7120; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
7121; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
7122; GFX6-NEXT:    s_mov_b32 s2, -1
7123; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7124; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7125; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7126; GFX6-NEXT:    s_waitcnt vmcnt(0)
7127; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7128; GFX6-NEXT:    s_waitcnt vmcnt(0)
7129; GFX6-NEXT:    buffer_wbinvl1
7130; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7131; GFX6-NEXT:    s_endpgm
7132;
7133; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7134; GFX7:       ; %bb.0: ; %entry
7135; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7136; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7138; GFX7-NEXT:    s_add_u32 s4, s0, 16
7139; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7140; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7141; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7142; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7143; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7144; GFX7-NEXT:    s_waitcnt vmcnt(0)
7145; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7146; GFX7-NEXT:    s_waitcnt vmcnt(0)
7147; GFX7-NEXT:    buffer_wbinvl1_vol
7148; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7149; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7150; GFX7-NEXT:    flat_store_dword v[0:1], v2
7151; GFX7-NEXT:    s_endpgm
7152;
7153; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7154; GFX10-WGP:       ; %bb.0: ; %entry
7155; GFX10-WGP-NEXT:    s_clause 0x1
7156; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7157; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7158; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
7159; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7160; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7161; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7162; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7163; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7164; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7165; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7166; GFX10-WGP-NEXT:    buffer_gl0_inv
7167; GFX10-WGP-NEXT:    buffer_gl1_inv
7168; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
7169; GFX10-WGP-NEXT:    s_endpgm
7170;
7171; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7172; GFX10-CU:       ; %bb.0: ; %entry
7173; GFX10-CU-NEXT:    s_clause 0x1
7174; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7175; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7176; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
7177; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7178; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7179; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7180; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7181; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7182; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7183; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7184; GFX10-CU-NEXT:    buffer_gl0_inv
7185; GFX10-CU-NEXT:    buffer_gl1_inv
7186; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
7187; GFX10-CU-NEXT:    s_endpgm
7188;
7189; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7190; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7191; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7192; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7193; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
7194; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
7195; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7196; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7197; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7198; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7199; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7200; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7201; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7202; SKIP-CACHE-INV-NEXT:    s_endpgm
7203;
7204; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7205; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7206; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7207; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7208; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7209; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7210; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7211; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7212; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7213; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7214; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7215; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7216; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7217; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7218; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7219;
7220; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
7221; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7222; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7223; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7224; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7225; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7226; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7227; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7228; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7229; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7230; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7231; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7232; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7233; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7234; GFX90A-TGSPLIT-NEXT:    s_endpgm
7235    i32 addrspace(1)* %out, i32 %in, i32 %old) {
7236entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16.
7237  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old and swap in %in; orderings are seq_cst on success
  ; and acquire on failure.
7238  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the
  ; i1 success flag is not used.
7239  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
7240  store i32 %val0, i32 addrspace(1)* %out, align 4
7241  ret void
7242}
7243
; Kernel under test: a volatile cmpxchg on an i32 in addrspace(1) with
; syncscope("one-as"), using seq_cst for both the success and the failure
; ordering. The value returned by the cmpxchg is stored back through %out.
; The check lines inside this definition are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
7244define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
7245; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7246; GFX6:       ; %bb.0: ; %entry
7247; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7248; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
7249; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
7250; GFX6-NEXT:    s_mov_b32 s2, -1
7251; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7252; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7253; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7254; GFX6-NEXT:    s_waitcnt vmcnt(0)
7255; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7256; GFX6-NEXT:    s_waitcnt vmcnt(0)
7257; GFX6-NEXT:    buffer_wbinvl1
7258; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7259; GFX6-NEXT:    s_endpgm
7260;
7261; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7262; GFX7:       ; %bb.0: ; %entry
7263; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7264; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7265; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7266; GFX7-NEXT:    s_add_u32 s4, s0, 16
7267; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7268; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7269; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7270; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7271; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7272; GFX7-NEXT:    s_waitcnt vmcnt(0)
7273; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7274; GFX7-NEXT:    s_waitcnt vmcnt(0)
7275; GFX7-NEXT:    buffer_wbinvl1_vol
7276; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7277; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7278; GFX7-NEXT:    flat_store_dword v[0:1], v2
7279; GFX7-NEXT:    s_endpgm
7280;
7281; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7282; GFX10-WGP:       ; %bb.0: ; %entry
7283; GFX10-WGP-NEXT:    s_clause 0x1
7284; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7285; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7286; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
7287; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7288; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7289; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7290; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7291; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7292; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7293; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7294; GFX10-WGP-NEXT:    buffer_gl0_inv
7295; GFX10-WGP-NEXT:    buffer_gl1_inv
7296; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
7297; GFX10-WGP-NEXT:    s_endpgm
7298;
7299; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7300; GFX10-CU:       ; %bb.0: ; %entry
7301; GFX10-CU-NEXT:    s_clause 0x1
7302; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
7303; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
7304; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
7305; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7306; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7307; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7308; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7309; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7310; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
7311; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7312; GFX10-CU-NEXT:    buffer_gl0_inv
7313; GFX10-CU-NEXT:    buffer_gl1_inv
7314; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
7315; GFX10-CU-NEXT:    s_endpgm
7316;
7317; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7318; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7319; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7321; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
7322; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
7323; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7326; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7327; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7328; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7329; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7330; SKIP-CACHE-INV-NEXT:    s_endpgm
7331;
7332; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7333; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7334; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7335; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7336; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7337; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7338; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7339; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7340; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7341; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7342; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7343; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7344; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7345; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7346; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7347;
7348; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
7349; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7350; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7351; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7352; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
7353; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7354; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
7355; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7356; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7357; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7358; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7359; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7360; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7361; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
7362; GFX90A-TGSPLIT-NEXT:    s_endpgm
7363    i32 addrspace(1)* %out, i32 %in, i32 %old) {
7364entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16.
7365  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old and swap in %in; orderings are seq_cst on both the
  ; success and the failure path.
7366  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the
  ; i1 success flag is not used.
7367  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
7368  store i32 %val0, i32 addrspace(1)* %out, align 4
7369  ret void
7370}
7371
7372