1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Kernel under test: reads an i32 from the addrspace(3) pointer %in with an
; 'unordered' atomic load (no syncscope is given) and writes the value to %out
; with a plain store. For every run the checks below expect a ds_read_b32, an
; s_waitcnt lgkmcnt(0) and then a ds_write_b32, with no cache-invalidate
; instruction anywhere in the kernel.
8define amdgpu_kernel void @local_system_unordered_load(
9; GFX6-LABEL: local_system_unordered_load:
10; GFX6:       ; %bb.0: ; %entry
11; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
12; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
13; GFX6-NEXT:    s_mov_b32 m0, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    v_mov_b32_e32 v0, s0
16; GFX6-NEXT:    ds_read_b32 v0, v0
17; GFX6-NEXT:    v_mov_b32_e32 v1, s1
18; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX6-NEXT:    ds_write_b32 v1, v0
20; GFX6-NEXT:    s_endpgm
21;
22; GFX7-LABEL: local_system_unordered_load:
23; GFX7:       ; %bb.0: ; %entry
24; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
25; GFX7-NEXT:    s_mov_b32 m0, -1
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    v_mov_b32_e32 v0, s0
28; GFX7-NEXT:    ds_read_b32 v0, v0
29; GFX7-NEXT:    v_mov_b32_e32 v1, s1
30; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX7-NEXT:    ds_write_b32 v1, v0
32; GFX7-NEXT:    s_endpgm
33;
34; GFX10-WGP-LABEL: local_system_unordered_load:
35; GFX10-WGP:       ; %bb.0: ; %entry
36; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
37; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
39; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
40; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
41; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
43; GFX10-WGP-NEXT:    s_endpgm
44;
45; GFX10-CU-LABEL: local_system_unordered_load:
46; GFX10-CU:       ; %bb.0: ; %entry
47; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
48; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
50; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
51; GFX10-CU-NEXT:    ds_read_b32 v0, v0
52; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX10-CU-NEXT:    ds_write_b32 v1, v0
54; GFX10-CU-NEXT:    s_endpgm
55;
56; SKIP-CACHE-INV-LABEL: local_system_unordered_load:
57; SKIP-CACHE-INV:       ; %bb.0: ; %entry
58; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
59; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
60; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
61; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
62; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
63; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
64; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
65; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
66; SKIP-CACHE-INV-NEXT:    s_endpgm
67    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
68entry:
  ; The atomic load is the operation under test; the store that follows is a
  ; plain (non-atomic) store that consumes the loaded value.
69  %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4
70  store i32 %val, i32 addrspace(3)* %out
71  ret void
72}
73
; Kernel under test: reads an i32 from the addrspace(3) pointer %in with a
; 'monotonic' atomic load (no syncscope is given) and writes the value to %out
; with a plain store. The expected instruction sequences are the same as for
; the unordered load: ds_read_b32, s_waitcnt lgkmcnt(0), ds_write_b32, and no
; cache-invalidate instruction in any run.
74define amdgpu_kernel void @local_system_monotonic_load(
75; GFX6-LABEL: local_system_monotonic_load:
76; GFX6:       ; %bb.0: ; %entry
77; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
78; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
79; GFX6-NEXT:    s_mov_b32 m0, -1
80; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX6-NEXT:    v_mov_b32_e32 v0, s0
82; GFX6-NEXT:    ds_read_b32 v0, v0
83; GFX6-NEXT:    v_mov_b32_e32 v1, s1
84; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX6-NEXT:    ds_write_b32 v1, v0
86; GFX6-NEXT:    s_endpgm
87;
88; GFX7-LABEL: local_system_monotonic_load:
89; GFX7:       ; %bb.0: ; %entry
90; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
91; GFX7-NEXT:    s_mov_b32 m0, -1
92; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX7-NEXT:    v_mov_b32_e32 v0, s0
94; GFX7-NEXT:    ds_read_b32 v0, v0
95; GFX7-NEXT:    v_mov_b32_e32 v1, s1
96; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX7-NEXT:    ds_write_b32 v1, v0
98; GFX7-NEXT:    s_endpgm
99;
100; GFX10-WGP-LABEL: local_system_monotonic_load:
101; GFX10-WGP:       ; %bb.0: ; %entry
102; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
103; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
105; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
106; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
107; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
109; GFX10-WGP-NEXT:    s_endpgm
110;
111; GFX10-CU-LABEL: local_system_monotonic_load:
112; GFX10-CU:       ; %bb.0: ; %entry
113; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
114; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
116; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
117; GFX10-CU-NEXT:    ds_read_b32 v0, v0
118; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
119; GFX10-CU-NEXT:    ds_write_b32 v1, v0
120; GFX10-CU-NEXT:    s_endpgm
121;
122; SKIP-CACHE-INV-LABEL: local_system_monotonic_load:
123; SKIP-CACHE-INV:       ; %bb.0: ; %entry
124; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
125; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
126; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
127; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
128; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
129; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
130; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
131; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
132; SKIP-CACHE-INV-NEXT:    s_endpgm
133    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
134entry:
  ; The atomic load is the operation under test; the store that follows is a
  ; plain (non-atomic) store that consumes the loaded value.
135  %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4
136  store i32 %val, i32 addrspace(3)* %out
137  ret void
138}
139
; Kernel under test: reads an i32 from the addrspace(3) pointer %in with an
; 'acquire' atomic load (no syncscope is given) and writes the value to %out
; with a plain store. After the ds_read_b32 the checks expect an
; s_waitcnt lgkmcnt(0) followed by a cache invalidate: buffer_wbinvl1 for
; gfx600, buffer_wbinvl1_vol for gfx700 on amdhsa, and buffer_gl0_inv plus
; buffer_gl1_inv for both gfx1010 runs. The run that passes
; -amdgcn-skip-cache-invalidations keeps the wait but has no invalidate.
140define amdgpu_kernel void @local_system_acquire_load(
141; GFX6-LABEL: local_system_acquire_load:
142; GFX6:       ; %bb.0: ; %entry
143; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
144; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
145; GFX6-NEXT:    s_mov_b32 m0, -1
146; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX6-NEXT:    v_mov_b32_e32 v0, s0
148; GFX6-NEXT:    ds_read_b32 v0, v0
149; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX6-NEXT:    buffer_wbinvl1
151; GFX6-NEXT:    v_mov_b32_e32 v1, s1
152; GFX6-NEXT:    ds_write_b32 v1, v0
153; GFX6-NEXT:    s_endpgm
154;
155; GFX7-LABEL: local_system_acquire_load:
156; GFX7:       ; %bb.0: ; %entry
157; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
158; GFX7-NEXT:    s_mov_b32 m0, -1
159; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX7-NEXT:    v_mov_b32_e32 v0, s0
161; GFX7-NEXT:    ds_read_b32 v0, v0
162; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX7-NEXT:    buffer_wbinvl1_vol
164; GFX7-NEXT:    v_mov_b32_e32 v1, s1
165; GFX7-NEXT:    ds_write_b32 v1, v0
166; GFX7-NEXT:    s_endpgm
167;
168; GFX10-WGP-LABEL: local_system_acquire_load:
169; GFX10-WGP:       ; %bb.0: ; %entry
170; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
171; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
173; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
174; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
175; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX10-WGP-NEXT:    buffer_gl0_inv
177; GFX10-WGP-NEXT:    buffer_gl1_inv
178; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
179; GFX10-WGP-NEXT:    s_endpgm
180;
181; GFX10-CU-LABEL: local_system_acquire_load:
182; GFX10-CU:       ; %bb.0: ; %entry
183; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
184; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
186; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
187; GFX10-CU-NEXT:    ds_read_b32 v0, v0
188; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX10-CU-NEXT:    buffer_gl0_inv
190; GFX10-CU-NEXT:    buffer_gl1_inv
191; GFX10-CU-NEXT:    ds_write_b32 v1, v0
192; GFX10-CU-NEXT:    s_endpgm
193;
194; SKIP-CACHE-INV-LABEL: local_system_acquire_load:
195; SKIP-CACHE-INV:       ; %bb.0: ; %entry
196; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
197; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
198; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
199; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
200; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
201; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
202; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
203; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
204; SKIP-CACHE-INV-NEXT:    s_endpgm
205    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
206entry:
  ; The atomic load is the operation under test; the store that follows is a
  ; plain (non-atomic) store that consumes the loaded value.
207  %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4
208  store i32 %val, i32 addrspace(3)* %out
209  ret void
210}
211
; Kernel under test: reads an i32 from the addrspace(3) pointer %in with a
; 'seq_cst' atomic load (no syncscope is given) and writes the value to %out
; with a plain store. Compared with the acquire load, the checks additionally
; expect an s_waitcnt vmcnt(0) lgkmcnt(0) before the ds_read_b32 (followed by
; s_waitcnt_vscnt null, 0x0 in both gfx1010 runs). After the read the expected
; sequence matches the acquire case: s_waitcnt lgkmcnt(0) and then the
; per-target cache invalidate, which is absent in the run that passes
; -amdgcn-skip-cache-invalidations.
212define amdgpu_kernel void @local_system_seq_cst_load(
213; GFX6-LABEL: local_system_seq_cst_load:
214; GFX6:       ; %bb.0: ; %entry
215; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
216; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
217; GFX6-NEXT:    s_mov_b32 m0, -1
218; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX6-NEXT:    v_mov_b32_e32 v0, s0
220; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
221; GFX6-NEXT:    ds_read_b32 v0, v0
222; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX6-NEXT:    buffer_wbinvl1
224; GFX6-NEXT:    v_mov_b32_e32 v1, s1
225; GFX6-NEXT:    ds_write_b32 v1, v0
226; GFX6-NEXT:    s_endpgm
227;
228; GFX7-LABEL: local_system_seq_cst_load:
229; GFX7:       ; %bb.0: ; %entry
230; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
231; GFX7-NEXT:    s_mov_b32 m0, -1
232; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX7-NEXT:    v_mov_b32_e32 v0, s0
234; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
235; GFX7-NEXT:    ds_read_b32 v0, v0
236; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX7-NEXT:    buffer_wbinvl1_vol
238; GFX7-NEXT:    v_mov_b32_e32 v1, s1
239; GFX7-NEXT:    ds_write_b32 v1, v0
240; GFX7-NEXT:    s_endpgm
241;
242; GFX10-WGP-LABEL: local_system_seq_cst_load:
243; GFX10-WGP:       ; %bb.0: ; %entry
244; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
245; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
247; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
248; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
249; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
250; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
251; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-WGP-NEXT:    buffer_gl0_inv
253; GFX10-WGP-NEXT:    buffer_gl1_inv
254; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
255; GFX10-WGP-NEXT:    s_endpgm
256;
257; GFX10-CU-LABEL: local_system_seq_cst_load:
258; GFX10-CU:       ; %bb.0: ; %entry
259; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
260; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
262; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
263; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
264; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
265; GFX10-CU-NEXT:    ds_read_b32 v0, v0
266; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX10-CU-NEXT:    buffer_gl0_inv
268; GFX10-CU-NEXT:    buffer_gl1_inv
269; GFX10-CU-NEXT:    ds_write_b32 v1, v0
270; GFX10-CU-NEXT:    s_endpgm
271;
272; SKIP-CACHE-INV-LABEL: local_system_seq_cst_load:
273; SKIP-CACHE-INV:       ; %bb.0: ; %entry
274; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
275; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
276; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
277; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
278; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
279; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
280; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
282; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
283; SKIP-CACHE-INV-NEXT:    s_endpgm
284    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
285entry:
  ; The atomic load is the operation under test; the store that follows is a
  ; plain (non-atomic) store that consumes the loaded value.
286  %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4
287  store i32 %val, i32 addrspace(3)* %out
288  ret void
289}
290
; Kernel under test: writes the i32 argument %in to the addrspace(3) pointer
; %out with an 'unordered' atomic store (no syncscope is given). For every run
; the checks expect a single ds_write_b32 whose only preceding wait is the
; s_waitcnt lgkmcnt(0) that follows the kernel-argument load.
291define amdgpu_kernel void @local_system_unordered_store(
292; GFX6-LABEL: local_system_unordered_store:
293; GFX6:       ; %bb.0: ; %entry
294; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
295; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
296; GFX6-NEXT:    s_mov_b32 m0, -1
297; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX6-NEXT:    v_mov_b32_e32 v1, s0
299; GFX6-NEXT:    v_mov_b32_e32 v0, s1
300; GFX6-NEXT:    ds_write_b32 v0, v1
301; GFX6-NEXT:    s_endpgm
302;
303; GFX7-LABEL: local_system_unordered_store:
304; GFX7:       ; %bb.0: ; %entry
305; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
306; GFX7-NEXT:    s_mov_b32 m0, -1
307; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
308; GFX7-NEXT:    v_mov_b32_e32 v0, s1
309; GFX7-NEXT:    v_mov_b32_e32 v1, s0
310; GFX7-NEXT:    ds_write_b32 v0, v1
311; GFX7-NEXT:    s_endpgm
312;
313; GFX10-WGP-LABEL: local_system_unordered_store:
314; GFX10-WGP:       ; %bb.0: ; %entry
315; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
316; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
317; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
318; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
319; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
320; GFX10-WGP-NEXT:    s_endpgm
321;
322; GFX10-CU-LABEL: local_system_unordered_store:
323; GFX10-CU:       ; %bb.0: ; %entry
324; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
325; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
327; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
328; GFX10-CU-NEXT:    ds_write_b32 v0, v1
329; GFX10-CU-NEXT:    s_endpgm
330;
331; SKIP-CACHE-INV-LABEL: local_system_unordered_store:
332; SKIP-CACHE-INV:       ; %bb.0: ; %entry
333; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
334; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
335; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
336; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
338; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
339; SKIP-CACHE-INV-NEXT:    s_endpgm
340    i32 %in, i32 addrspace(3)* %out) {
341entry:
  ; The atomic store is the only operation in the kernel body.
342  store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4
343  ret void
344}
345
; Kernel under test: writes the i32 argument %in to the addrspace(3) pointer
; %out with a 'monotonic' atomic store (no syncscope is given). The expected
; sequences are the same as for the unordered store: a single ds_write_b32
; whose only preceding wait is the s_waitcnt lgkmcnt(0) that follows the
; kernel-argument load.
346define amdgpu_kernel void @local_system_monotonic_store(
347; GFX6-LABEL: local_system_monotonic_store:
348; GFX6:       ; %bb.0: ; %entry
349; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
350; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
351; GFX6-NEXT:    s_mov_b32 m0, -1
352; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX6-NEXT:    v_mov_b32_e32 v1, s0
354; GFX6-NEXT:    v_mov_b32_e32 v0, s1
355; GFX6-NEXT:    ds_write_b32 v0, v1
356; GFX6-NEXT:    s_endpgm
357;
358; GFX7-LABEL: local_system_monotonic_store:
359; GFX7:       ; %bb.0: ; %entry
360; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
361; GFX7-NEXT:    s_mov_b32 m0, -1
362; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX7-NEXT:    v_mov_b32_e32 v0, s1
364; GFX7-NEXT:    v_mov_b32_e32 v1, s0
365; GFX7-NEXT:    ds_write_b32 v0, v1
366; GFX7-NEXT:    s_endpgm
367;
368; GFX10-WGP-LABEL: local_system_monotonic_store:
369; GFX10-WGP:       ; %bb.0: ; %entry
370; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
371; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
373; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
374; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
375; GFX10-WGP-NEXT:    s_endpgm
376;
377; GFX10-CU-LABEL: local_system_monotonic_store:
378; GFX10-CU:       ; %bb.0: ; %entry
379; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
380; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
382; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
383; GFX10-CU-NEXT:    ds_write_b32 v0, v1
384; GFX10-CU-NEXT:    s_endpgm
385;
386; SKIP-CACHE-INV-LABEL: local_system_monotonic_store:
387; SKIP-CACHE-INV:       ; %bb.0: ; %entry
388; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
389; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
390; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
393; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
394; SKIP-CACHE-INV-NEXT:    s_endpgm
395    i32 %in, i32 addrspace(3)* %out) {
396entry:
  ; The atomic store is the only operation in the kernel body.
397  store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4
398  ret void
399}
400
; Kernel under test: writes the i32 argument %in to the addrspace(3) pointer
; %out with a 'release' atomic store (no syncscope is given). Compared with
; the monotonic store, the checks expect an extra s_waitcnt vmcnt(0) lgkmcnt(0)
; immediately before the ds_write_b32; both gfx1010 runs also expect
; s_waitcnt_vscnt null, 0x0 after that wait. The run that passes
; -amdgcn-skip-cache-invalidations still expects the wait.
401define amdgpu_kernel void @local_system_release_store(
402; GFX6-LABEL: local_system_release_store:
403; GFX6:       ; %bb.0: ; %entry
404; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
405; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
406; GFX6-NEXT:    s_mov_b32 m0, -1
407; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX6-NEXT:    v_mov_b32_e32 v1, s0
409; GFX6-NEXT:    v_mov_b32_e32 v0, s1
410; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
411; GFX6-NEXT:    ds_write_b32 v0, v1
412; GFX6-NEXT:    s_endpgm
413;
414; GFX7-LABEL: local_system_release_store:
415; GFX7:       ; %bb.0: ; %entry
416; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
417; GFX7-NEXT:    s_mov_b32 m0, -1
418; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX7-NEXT:    v_mov_b32_e32 v0, s1
420; GFX7-NEXT:    v_mov_b32_e32 v1, s0
421; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
422; GFX7-NEXT:    ds_write_b32 v0, v1
423; GFX7-NEXT:    s_endpgm
424;
425; GFX10-WGP-LABEL: local_system_release_store:
426; GFX10-WGP:       ; %bb.0: ; %entry
427; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
428; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
430; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
431; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
432; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
433; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
434; GFX10-WGP-NEXT:    s_endpgm
435;
436; GFX10-CU-LABEL: local_system_release_store:
437; GFX10-CU:       ; %bb.0: ; %entry
438; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
439; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
440; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
441; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
442; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
443; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
444; GFX10-CU-NEXT:    ds_write_b32 v0, v1
445; GFX10-CU-NEXT:    s_endpgm
446;
447; SKIP-CACHE-INV-LABEL: local_system_release_store:
448; SKIP-CACHE-INV:       ; %bb.0: ; %entry
449; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
450; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
451; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
452; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
454; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
455; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
456; SKIP-CACHE-INV-NEXT:    s_endpgm
457    i32 %in, i32 addrspace(3)* %out) {
458entry:
  ; The atomic store is the only operation in the kernel body.
459  store atomic i32 %in, i32 addrspace(3)* %out release, align 4
460  ret void
461}
462
; Kernel under test: writes the i32 argument %in to the addrspace(3) pointer
; %out with a 'seq_cst' atomic store (no syncscope is given). The expected
; sequences are the same as for the release store: an
; s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the ds_write_b32, with
; s_waitcnt_vscnt null, 0x0 added after that wait in both gfx1010 runs.
463define amdgpu_kernel void @local_system_seq_cst_store(
464; GFX6-LABEL: local_system_seq_cst_store:
465; GFX6:       ; %bb.0: ; %entry
466; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
467; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
468; GFX6-NEXT:    s_mov_b32 m0, -1
469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX6-NEXT:    v_mov_b32_e32 v1, s0
471; GFX6-NEXT:    v_mov_b32_e32 v0, s1
472; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
473; GFX6-NEXT:    ds_write_b32 v0, v1
474; GFX6-NEXT:    s_endpgm
475;
476; GFX7-LABEL: local_system_seq_cst_store:
477; GFX7:       ; %bb.0: ; %entry
478; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
479; GFX7-NEXT:    s_mov_b32 m0, -1
480; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX7-NEXT:    v_mov_b32_e32 v0, s1
482; GFX7-NEXT:    v_mov_b32_e32 v1, s0
483; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
484; GFX7-NEXT:    ds_write_b32 v0, v1
485; GFX7-NEXT:    s_endpgm
486;
487; GFX10-WGP-LABEL: local_system_seq_cst_store:
488; GFX10-WGP:       ; %bb.0: ; %entry
489; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
490; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
492; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
493; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
494; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
495; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
496; GFX10-WGP-NEXT:    s_endpgm
497;
498; GFX10-CU-LABEL: local_system_seq_cst_store:
499; GFX10-CU:       ; %bb.0: ; %entry
500; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
501; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
503; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
504; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
505; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
506; GFX10-CU-NEXT:    ds_write_b32 v0, v1
507; GFX10-CU-NEXT:    s_endpgm
508;
509; SKIP-CACHE-INV-LABEL: local_system_seq_cst_store:
510; SKIP-CACHE-INV:       ; %bb.0: ; %entry
511; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
512; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
513; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
516; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
517; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
518; SKIP-CACHE-INV-NEXT:    s_endpgm
519    i32 %in, i32 addrspace(3)* %out) {
520entry:
  ; The atomic store is the only operation in the kernel body.
521  store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4
522  ret void
523}
524
; Kernel under test: performs a volatile 'monotonic' atomicrmw xchg of the i32
; argument %in into the addrspace(3) pointer %out (no syncscope is given). For
; every run the checks expect a single ds_wrxchg_rtn_b32 whose only preceding
; wait is the s_waitcnt lgkmcnt(0) that follows the kernel-argument load, and
; no cache-invalidate instruction.
525define amdgpu_kernel void @local_system_monotonic_atomicrmw(
526; GFX6-LABEL: local_system_monotonic_atomicrmw:
527; GFX6:       ; %bb.0: ; %entry
528; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
529; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
530; GFX6-NEXT:    s_mov_b32 m0, -1
531; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX6-NEXT:    v_mov_b32_e32 v0, s0
533; GFX6-NEXT:    v_mov_b32_e32 v1, s1
534; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
535; GFX6-NEXT:    s_endpgm
536;
537; GFX7-LABEL: local_system_monotonic_atomicrmw:
538; GFX7:       ; %bb.0: ; %entry
539; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
540; GFX7-NEXT:    s_mov_b32 m0, -1
541; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX7-NEXT:    v_mov_b32_e32 v0, s0
543; GFX7-NEXT:    v_mov_b32_e32 v1, s1
544; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
545; GFX7-NEXT:    s_endpgm
546;
547; GFX10-WGP-LABEL: local_system_monotonic_atomicrmw:
548; GFX10-WGP:       ; %bb.0: ; %entry
549; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
550; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
552; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
553; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
554; GFX10-WGP-NEXT:    s_endpgm
555;
556; GFX10-CU-LABEL: local_system_monotonic_atomicrmw:
557; GFX10-CU:       ; %bb.0: ; %entry
558; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
559; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
561; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
562; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
563; GFX10-CU-NEXT:    s_endpgm
564;
565; SKIP-CACHE-INV-LABEL: local_system_monotonic_atomicrmw:
566; SKIP-CACHE-INV:       ; %bb.0: ; %entry
567; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
568; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
569; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
570; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
571; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
572; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
573; SKIP-CACHE-INV-NEXT:    s_endpgm
574    i32 addrspace(3)* %out, i32 %in) {
575entry:
  ; The previous memory value returned in %val is never used afterwards.
576  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic
577  ret void
578}
579
; Kernel under test: performs a volatile 'acquire' atomicrmw xchg of the i32
; argument %in into the addrspace(3) pointer %out (no syncscope is given).
; After the ds_wrxchg_rtn_b32 the checks expect a wait followed by a cache
; invalidate: s_waitcnt vmcnt(0) lgkmcnt(0) then buffer_wbinvl1 for gfx600 or
; buffer_wbinvl1_vol for gfx700 on amdhsa; s_waitcnt lgkmcnt(0),
; s_waitcnt_vscnt null, 0x0, buffer_gl0_inv and buffer_gl1_inv for both
; gfx1010 runs. The run that passes -amdgcn-skip-cache-invalidations keeps the
; wait but has no invalidate.
580define amdgpu_kernel void @local_system_acquire_atomicrmw(
581; GFX6-LABEL: local_system_acquire_atomicrmw:
582; GFX6:       ; %bb.0: ; %entry
583; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
584; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
585; GFX6-NEXT:    s_mov_b32 m0, -1
586; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX6-NEXT:    v_mov_b32_e32 v0, s0
588; GFX6-NEXT:    v_mov_b32_e32 v1, s1
589; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
590; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
591; GFX6-NEXT:    buffer_wbinvl1
592; GFX6-NEXT:    s_endpgm
593;
594; GFX7-LABEL: local_system_acquire_atomicrmw:
595; GFX7:       ; %bb.0: ; %entry
596; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
597; GFX7-NEXT:    s_mov_b32 m0, -1
598; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX7-NEXT:    v_mov_b32_e32 v0, s0
600; GFX7-NEXT:    v_mov_b32_e32 v1, s1
601; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
602; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
603; GFX7-NEXT:    buffer_wbinvl1_vol
604; GFX7-NEXT:    s_endpgm
605;
606; GFX10-WGP-LABEL: local_system_acquire_atomicrmw:
607; GFX10-WGP:       ; %bb.0: ; %entry
608; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
609; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
611; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
612; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
613; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
615; GFX10-WGP-NEXT:    buffer_gl0_inv
616; GFX10-WGP-NEXT:    buffer_gl1_inv
617; GFX10-WGP-NEXT:    s_endpgm
618;
619; GFX10-CU-LABEL: local_system_acquire_atomicrmw:
620; GFX10-CU:       ; %bb.0: ; %entry
621; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
622; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
624; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
625; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
626; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
628; GFX10-CU-NEXT:    buffer_gl0_inv
629; GFX10-CU-NEXT:    buffer_gl1_inv
630; GFX10-CU-NEXT:    s_endpgm
631;
632; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw:
633; SKIP-CACHE-INV:       ; %bb.0: ; %entry
634; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
635; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
636; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
639; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
640; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
641; SKIP-CACHE-INV-NEXT:    s_endpgm
642    i32 addrspace(3)* %out, i32 %in) {
643entry:
  ; The previous memory value returned in %val is never used afterwards.
644  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
645  ret void
646}
647
; Kernel under test: performs a volatile 'release' atomicrmw xchg of the i32
; argument %in into the addrspace(3) pointer %out (no syncscope is given). The
; checks expect an s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the
; ds_wrxchg_rtn_b32 (with s_waitcnt_vscnt null, 0x0 added after that wait in
; both gfx1010 runs) and nothing between the exchange and s_endpgm.
648define amdgpu_kernel void @local_system_release_atomicrmw(
649; GFX6-LABEL: local_system_release_atomicrmw:
650; GFX6:       ; %bb.0: ; %entry
651; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
652; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
653; GFX6-NEXT:    s_mov_b32 m0, -1
654; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX6-NEXT:    v_mov_b32_e32 v0, s0
656; GFX6-NEXT:    v_mov_b32_e32 v1, s1
657; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
658; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
659; GFX6-NEXT:    s_endpgm
660;
661; GFX7-LABEL: local_system_release_atomicrmw:
662; GFX7:       ; %bb.0: ; %entry
663; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
664; GFX7-NEXT:    s_mov_b32 m0, -1
665; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
666; GFX7-NEXT:    v_mov_b32_e32 v0, s0
667; GFX7-NEXT:    v_mov_b32_e32 v1, s1
668; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
669; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
670; GFX7-NEXT:    s_endpgm
671;
672; GFX10-WGP-LABEL: local_system_release_atomicrmw:
673; GFX10-WGP:       ; %bb.0: ; %entry
674; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
675; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
677; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
678; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
679; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
680; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
681; GFX10-WGP-NEXT:    s_endpgm
682;
683; GFX10-CU-LABEL: local_system_release_atomicrmw:
684; GFX10-CU:       ; %bb.0: ; %entry
685; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
686; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
688; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
689; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
690; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
691; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
692; GFX10-CU-NEXT:    s_endpgm
693;
694; SKIP-CACHE-INV-LABEL: local_system_release_atomicrmw:
695; SKIP-CACHE-INV:       ; %bb.0: ; %entry
696; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
697; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
698; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
699; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
700; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
701; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
702; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
703; SKIP-CACHE-INV-NEXT:    s_endpgm
704    i32 addrspace(3)* %out, i32 %in) {
705entry:
  ; The previous memory value returned in %val is never used afterwards.
706  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release
707  ret void
708}
709
; Kernel under test: performs a volatile 'acq_rel' atomicrmw xchg of the i32
; argument %in into the addrspace(3) pointer %out (no syncscope is given). The
; expected code combines the release and acquire sequences: an
; s_waitcnt vmcnt(0) lgkmcnt(0) before the ds_wrxchg_rtn_b32 (plus
; s_waitcnt_vscnt null, 0x0 in both gfx1010 runs), then a wait and the
; per-target cache invalidate after it. The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate.
710define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
711; GFX6-LABEL: local_system_acq_rel_atomicrmw:
712; GFX6:       ; %bb.0: ; %entry
713; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
714; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
715; GFX6-NEXT:    s_mov_b32 m0, -1
716; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX6-NEXT:    v_mov_b32_e32 v0, s0
718; GFX6-NEXT:    v_mov_b32_e32 v1, s1
719; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
720; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
721; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
722; GFX6-NEXT:    buffer_wbinvl1
723; GFX6-NEXT:    s_endpgm
724;
725; GFX7-LABEL: local_system_acq_rel_atomicrmw:
726; GFX7:       ; %bb.0: ; %entry
727; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
728; GFX7-NEXT:    s_mov_b32 m0, -1
729; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX7-NEXT:    v_mov_b32_e32 v0, s0
731; GFX7-NEXT:    v_mov_b32_e32 v1, s1
732; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
733; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
734; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
735; GFX7-NEXT:    buffer_wbinvl1_vol
736; GFX7-NEXT:    s_endpgm
737;
738; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw:
739; GFX10-WGP:       ; %bb.0: ; %entry
740; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
741; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
743; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
744; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
745; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
746; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
747; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
749; GFX10-WGP-NEXT:    buffer_gl0_inv
750; GFX10-WGP-NEXT:    buffer_gl1_inv
751; GFX10-WGP-NEXT:    s_endpgm
752;
753; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw:
754; GFX10-CU:       ; %bb.0: ; %entry
755; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
756; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
758; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
759; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
760; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
761; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
762; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
764; GFX10-CU-NEXT:    buffer_gl0_inv
765; GFX10-CU-NEXT:    buffer_gl1_inv
766; GFX10-CU-NEXT:    s_endpgm
767;
768; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw:
769; SKIP-CACHE-INV:       ; %bb.0: ; %entry
770; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
771; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
772; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
773; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
774; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
775; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
776; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
777; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
778; SKIP-CACHE-INV-NEXT:    s_endpgm
779    i32 addrspace(3)* %out, i32 %in) {
780entry:
  ; The previous memory value returned in %val is never used afterwards.
781  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
782  ret void
783}
784
; Volatile seq_cst "atomicrmw xchg" on an addrspace(3) pointer with no explicit
; syncscope; the value returned by the exchange is not used.
; Per the check lines below, every configuration waits both before and after
; ds_wrxchg_rtn_b32, and all but the -amdgcn-skip-cache-invalidations run
; follow it with a cache invalidate.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
785define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
786; GFX6-LABEL: local_system_seq_cst_atomicrmw:
787; GFX6:       ; %bb.0: ; %entry
788; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
789; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
790; GFX6-NEXT:    s_mov_b32 m0, -1
791; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX6-NEXT:    v_mov_b32_e32 v0, s0
793; GFX6-NEXT:    v_mov_b32_e32 v1, s1
794; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
795; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
796; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
797; GFX6-NEXT:    buffer_wbinvl1
798; GFX6-NEXT:    s_endpgm
799;
800; GFX7-LABEL: local_system_seq_cst_atomicrmw:
801; GFX7:       ; %bb.0: ; %entry
802; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
803; GFX7-NEXT:    s_mov_b32 m0, -1
804; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX7-NEXT:    v_mov_b32_e32 v0, s0
806; GFX7-NEXT:    v_mov_b32_e32 v1, s1
807; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
808; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
809; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
810; GFX7-NEXT:    buffer_wbinvl1_vol
811; GFX7-NEXT:    s_endpgm
812;
813; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw:
814; GFX10-WGP:       ; %bb.0: ; %entry
815; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
816; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
818; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
819; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
820; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
821; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
822; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
824; GFX10-WGP-NEXT:    buffer_gl0_inv
825; GFX10-WGP-NEXT:    buffer_gl1_inv
826; GFX10-WGP-NEXT:    s_endpgm
827;
828; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw:
829; GFX10-CU:       ; %bb.0: ; %entry
830; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
831; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
833; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
834; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
835; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
836; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
837; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
838; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
839; GFX10-CU-NEXT:    buffer_gl0_inv
840; GFX10-CU-NEXT:    buffer_gl1_inv
841; GFX10-CU-NEXT:    s_endpgm
842;
843; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw:
844; SKIP-CACHE-INV:       ; %bb.0: ; %entry
845; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
846; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
847; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
848; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
849; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
850; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
851; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
852; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
853; SKIP-CACHE-INV-NEXT:    s_endpgm
854    i32 addrspace(3)* %out, i32 %in) {
855entry:
856  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
857  ret void
858}
859
; Volatile acquire "atomicrmw xchg" on an addrspace(3) pointer with no explicit
; syncscope; the returned value is stored back through %out, so the result of
; the exchange is live.
; Per the check lines below, ds_wrxchg_rtn_b32 is followed by a wait and, in
; all but the -amdgcn-skip-cache-invalidations run, a cache invalidate before
; the ds_write_b32 of the result.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
860define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
861; GFX6-LABEL: local_system_acquire_ret_atomicrmw:
862; GFX6:       ; %bb.0: ; %entry
863; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
864; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
865; GFX6-NEXT:    s_mov_b32 m0, -1
866; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX6-NEXT:    v_mov_b32_e32 v0, s0
868; GFX6-NEXT:    v_mov_b32_e32 v1, s1
869; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
870; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
871; GFX6-NEXT:    buffer_wbinvl1
872; GFX6-NEXT:    ds_write_b32 v0, v1
873; GFX6-NEXT:    s_endpgm
874;
875; GFX7-LABEL: local_system_acquire_ret_atomicrmw:
876; GFX7:       ; %bb.0: ; %entry
877; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
878; GFX7-NEXT:    s_mov_b32 m0, -1
879; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX7-NEXT:    v_mov_b32_e32 v0, s0
881; GFX7-NEXT:    v_mov_b32_e32 v1, s1
882; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
883; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
884; GFX7-NEXT:    buffer_wbinvl1_vol
885; GFX7-NEXT:    ds_write_b32 v0, v1
886; GFX7-NEXT:    s_endpgm
887;
888; GFX10-WGP-LABEL: local_system_acquire_ret_atomicrmw:
889; GFX10-WGP:       ; %bb.0: ; %entry
890; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
891; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
892; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
893; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
894; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
895; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
897; GFX10-WGP-NEXT:    buffer_gl0_inv
898; GFX10-WGP-NEXT:    buffer_gl1_inv
899; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
900; GFX10-WGP-NEXT:    s_endpgm
901;
902; GFX10-CU-LABEL: local_system_acquire_ret_atomicrmw:
903; GFX10-CU:       ; %bb.0: ; %entry
904; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
905; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
907; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
908; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
909; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
911; GFX10-CU-NEXT:    buffer_gl0_inv
912; GFX10-CU-NEXT:    buffer_gl1_inv
913; GFX10-CU-NEXT:    ds_write_b32 v0, v1
914; GFX10-CU-NEXT:    s_endpgm
915;
916; SKIP-CACHE-INV-LABEL: local_system_acquire_ret_atomicrmw:
917; SKIP-CACHE-INV:       ; %bb.0: ; %entry
918; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
919; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
920; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
922; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
923; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
924; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
925; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
926; SKIP-CACHE-INV-NEXT:    s_endpgm
927    i32 addrspace(3)* %out, i32 %in) {
928entry:
929  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
930  store i32 %val, i32 addrspace(3)* %out, align 4
931  ret void
932}
933
; Volatile acq_rel "atomicrmw xchg" on an addrspace(3) pointer with no explicit
; syncscope; the returned value is stored back through %out, so the result of
; the exchange is live.
; Per the check lines below, every configuration waits both before and after
; ds_wrxchg_rtn_b32; all but the -amdgcn-skip-cache-invalidations run then
; issue a cache invalidate before the ds_write_b32 of the result.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
934define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
935; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw:
936; GFX6:       ; %bb.0: ; %entry
937; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
938; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
939; GFX6-NEXT:    s_mov_b32 m0, -1
940; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX6-NEXT:    v_mov_b32_e32 v0, s0
942; GFX6-NEXT:    v_mov_b32_e32 v1, s1
943; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
944; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
945; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
946; GFX6-NEXT:    buffer_wbinvl1
947; GFX6-NEXT:    ds_write_b32 v0, v1
948; GFX6-NEXT:    s_endpgm
949;
950; GFX7-LABEL: local_system_acq_rel_ret_atomicrmw:
951; GFX7:       ; %bb.0: ; %entry
952; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
953; GFX7-NEXT:    s_mov_b32 m0, -1
954; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX7-NEXT:    v_mov_b32_e32 v0, s0
956; GFX7-NEXT:    v_mov_b32_e32 v1, s1
957; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
958; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
959; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
960; GFX7-NEXT:    buffer_wbinvl1_vol
961; GFX7-NEXT:    ds_write_b32 v0, v1
962; GFX7-NEXT:    s_endpgm
963;
964; GFX10-WGP-LABEL: local_system_acq_rel_ret_atomicrmw:
965; GFX10-WGP:       ; %bb.0: ; %entry
966; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
967; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
969; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
970; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
971; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
972; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
973; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
975; GFX10-WGP-NEXT:    buffer_gl0_inv
976; GFX10-WGP-NEXT:    buffer_gl1_inv
977; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
978; GFX10-WGP-NEXT:    s_endpgm
979;
980; GFX10-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
981; GFX10-CU:       ; %bb.0: ; %entry
982; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
983; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
985; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
986; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
987; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
988; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
989; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
991; GFX10-CU-NEXT:    buffer_gl0_inv
992; GFX10-CU-NEXT:    buffer_gl1_inv
993; GFX10-CU-NEXT:    ds_write_b32 v0, v1
994; GFX10-CU-NEXT:    s_endpgm
995;
996; SKIP-CACHE-INV-LABEL: local_system_acq_rel_ret_atomicrmw:
997; SKIP-CACHE-INV:       ; %bb.0: ; %entry
998; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
999; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1000; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1001; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1002; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1003; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1004; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1005; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1006; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1007; SKIP-CACHE-INV-NEXT:    s_endpgm
1008    i32 addrspace(3)* %out, i32 %in) {
1009entry:
1010  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
1011  store i32 %val, i32 addrspace(3)* %out, align 4
1012  ret void
1013}
1014
; Volatile seq_cst "atomicrmw xchg" on an addrspace(3) pointer with no explicit
; syncscope; the returned value is stored back through %out, so the result of
; the exchange is live.
; Per the check lines below, every configuration waits both before and after
; ds_wrxchg_rtn_b32; all but the -amdgcn-skip-cache-invalidations run then
; issue a cache invalidate before the ds_write_b32 of the result.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1015define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
1016; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw:
1017; GFX6:       ; %bb.0: ; %entry
1018; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1019; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1020; GFX6-NEXT:    s_mov_b32 m0, -1
1021; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1023; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1024; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1025; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1026; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1027; GFX6-NEXT:    buffer_wbinvl1
1028; GFX6-NEXT:    ds_write_b32 v0, v1
1029; GFX6-NEXT:    s_endpgm
1030;
1031; GFX7-LABEL: local_system_seq_cst_ret_atomicrmw:
1032; GFX7:       ; %bb.0: ; %entry
1033; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1034; GFX7-NEXT:    s_mov_b32 m0, -1
1035; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1037; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1038; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1039; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1040; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1041; GFX7-NEXT:    buffer_wbinvl1_vol
1042; GFX7-NEXT:    ds_write_b32 v0, v1
1043; GFX7-NEXT:    s_endpgm
1044;
1045; GFX10-WGP-LABEL: local_system_seq_cst_ret_atomicrmw:
1046; GFX10-WGP:       ; %bb.0: ; %entry
1047; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1048; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1050; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1051; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1052; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1053; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1054; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1056; GFX10-WGP-NEXT:    buffer_gl0_inv
1057; GFX10-WGP-NEXT:    buffer_gl1_inv
1058; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1059; GFX10-WGP-NEXT:    s_endpgm
1060;
1061; GFX10-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
1062; GFX10-CU:       ; %bb.0: ; %entry
1063; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1064; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1066; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1067; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1068; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1069; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1070; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1072; GFX10-CU-NEXT:    buffer_gl0_inv
1073; GFX10-CU-NEXT:    buffer_gl1_inv
1074; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1075; GFX10-CU-NEXT:    s_endpgm
1076;
1077; SKIP-CACHE-INV-LABEL: local_system_seq_cst_ret_atomicrmw:
1078; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1079; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1080; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1081; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1082; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1083; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1084; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1085; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1086; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1087; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1088; SKIP-CACHE-INV-NEXT:    s_endpgm
1089    i32 addrspace(3)* %out, i32 %in) {
1090entry:
1091  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
1092  store i32 %val, i32 addrspace(3)* %out, align 4
1093  ret void
1094}
1095
; Volatile cmpxchg (success ordering monotonic, failure ordering monotonic) on
; an addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, no configuration adds a wait or cache invalidate
; around ds_cmpst_b32 beyond the wait for the kernel-argument loads.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1096define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
1097; GFX6-LABEL: local_system_monotonic_monotonic_cmpxchg:
1098; GFX6:       ; %bb.0: ; %entry
1099; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1100; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1101; GFX6-NEXT:    s_mov_b32 m0, -1
1102; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1104; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1105; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1106; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1107; GFX6-NEXT:    s_endpgm
1108;
1109; GFX7-LABEL: local_system_monotonic_monotonic_cmpxchg:
1110; GFX7:       ; %bb.0: ; %entry
1111; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1112; GFX7-NEXT:    s_mov_b32 m0, -1
1113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1115; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1116; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1117; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1118; GFX7-NEXT:    s_endpgm
1119;
1120; GFX10-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
1121; GFX10-WGP:       ; %bb.0: ; %entry
1122; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1123; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1125; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1126; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1127; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1128; GFX10-WGP-NEXT:    s_endpgm
1129;
1130; GFX10-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
1131; GFX10-CU:       ; %bb.0: ; %entry
1132; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1133; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1135; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1136; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1137; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1138; GFX10-CU-NEXT:    s_endpgm
1139;
1140; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_cmpxchg:
1141; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1142; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1143; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1144; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1145; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1146; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1147; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1148; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1149; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1150; SKIP-CACHE-INV-NEXT:    s_endpgm
1151    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1152entry:
1153  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1154  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic monotonic
1155  ret void
1156}
1157
; Volatile cmpxchg (success ordering acquire, failure ordering monotonic) on an
; addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, ds_cmpst_b32 is followed by a wait and, in all but
; the -amdgcn-skip-cache-invalidations run, a cache invalidate.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1158define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
1159; GFX6-LABEL: local_system_acquire_monotonic_cmpxchg:
1160; GFX6:       ; %bb.0: ; %entry
1161; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1162; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1163; GFX6-NEXT:    s_mov_b32 m0, -1
1164; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1166; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1167; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1168; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1169; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1170; GFX6-NEXT:    buffer_wbinvl1
1171; GFX6-NEXT:    s_endpgm
1172;
1173; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg:
1174; GFX7:       ; %bb.0: ; %entry
1175; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1176; GFX7-NEXT:    s_mov_b32 m0, -1
1177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1179; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1180; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1181; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1182; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1183; GFX7-NEXT:    buffer_wbinvl1_vol
1184; GFX7-NEXT:    s_endpgm
1185;
1186; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
1187; GFX10-WGP:       ; %bb.0: ; %entry
1188; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1189; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1191; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1192; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1193; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1194; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1196; GFX10-WGP-NEXT:    buffer_gl0_inv
1197; GFX10-WGP-NEXT:    buffer_gl1_inv
1198; GFX10-WGP-NEXT:    s_endpgm
1199;
1200; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
1201; GFX10-CU:       ; %bb.0: ; %entry
1202; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1203; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1205; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1206; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1207; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1208; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1210; GFX10-CU-NEXT:    buffer_gl0_inv
1211; GFX10-CU-NEXT:    buffer_gl1_inv
1212; GFX10-CU-NEXT:    s_endpgm
1213;
1214; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg:
1215; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1216; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1217; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1218; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1219; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1220; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1223; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1224; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1225; SKIP-CACHE-INV-NEXT:    s_endpgm
1226    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1227entry:
1228  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1229  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic
1230  ret void
1231}
1232
; Volatile cmpxchg (success ordering release, failure ordering monotonic) on an
; addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, every configuration waits immediately before
; ds_cmpst_b32, and none adds a wait or cache invalidate after it.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1233define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
1234; GFX6-LABEL: local_system_release_monotonic_cmpxchg:
1235; GFX6:       ; %bb.0: ; %entry
1236; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1237; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1238; GFX6-NEXT:    s_mov_b32 m0, -1
1239; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1240; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1241; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1242; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1243; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1244; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1245; GFX6-NEXT:    s_endpgm
1246;
1247; GFX7-LABEL: local_system_release_monotonic_cmpxchg:
1248; GFX7:       ; %bb.0: ; %entry
1249; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1250; GFX7-NEXT:    s_mov_b32 m0, -1
1251; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1253; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1254; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1255; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1256; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1257; GFX7-NEXT:    s_endpgm
1258;
1259; GFX10-WGP-LABEL: local_system_release_monotonic_cmpxchg:
1260; GFX10-WGP:       ; %bb.0: ; %entry
1261; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1262; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1264; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1265; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1266; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1267; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1268; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1269; GFX10-WGP-NEXT:    s_endpgm
1270;
1271; GFX10-CU-LABEL: local_system_release_monotonic_cmpxchg:
1272; GFX10-CU:       ; %bb.0: ; %entry
1273; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1274; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1276; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1277; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1278; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1279; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1280; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1281; GFX10-CU-NEXT:    s_endpgm
1282;
1283; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_cmpxchg:
1284; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1285; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1286; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1287; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1288; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1289; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1290; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1291; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1292; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1293; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1294; SKIP-CACHE-INV-NEXT:    s_endpgm
1295    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1296entry:
1297  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1298  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release monotonic
1299  ret void
1300}
1301
; Volatile cmpxchg (success ordering acq_rel, failure ordering monotonic) on an
; addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, every configuration waits both before and after
; ds_cmpst_b32, and all but the -amdgcn-skip-cache-invalidations run follow it
; with a cache invalidate.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1302define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
1303; GFX6-LABEL: local_system_acq_rel_monotonic_cmpxchg:
1304; GFX6:       ; %bb.0: ; %entry
1305; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1306; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1307; GFX6-NEXT:    s_mov_b32 m0, -1
1308; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1310; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1311; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1312; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1313; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1314; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1315; GFX6-NEXT:    buffer_wbinvl1
1316; GFX6-NEXT:    s_endpgm
1317;
1318; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg:
1319; GFX7:       ; %bb.0: ; %entry
1320; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1321; GFX7-NEXT:    s_mov_b32 m0, -1
1322; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1323; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1324; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1325; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1326; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1327; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1328; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1329; GFX7-NEXT:    buffer_wbinvl1_vol
1330; GFX7-NEXT:    s_endpgm
1331;
1332; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
1333; GFX10-WGP:       ; %bb.0: ; %entry
1334; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1335; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1337; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1338; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1339; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1340; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1341; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1342; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1344; GFX10-WGP-NEXT:    buffer_gl0_inv
1345; GFX10-WGP-NEXT:    buffer_gl1_inv
1346; GFX10-WGP-NEXT:    s_endpgm
1347;
1348; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
1349; GFX10-CU:       ; %bb.0: ; %entry
1350; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1351; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1352; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1353; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1354; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1355; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1356; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1357; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1358; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1359; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1360; GFX10-CU-NEXT:    buffer_gl0_inv
1361; GFX10-CU-NEXT:    buffer_gl1_inv
1362; GFX10-CU-NEXT:    s_endpgm
1363;
1364; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg:
1365; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1366; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1367; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1368; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1369; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1370; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1371; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1372; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1373; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1374; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1375; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1376; SKIP-CACHE-INV-NEXT:    s_endpgm
1377    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1378entry:
1379  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1380  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic
1381  ret void
1382}
1383
; Volatile cmpxchg (success ordering seq_cst, failure ordering monotonic) on an
; addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, every configuration waits both before and after
; ds_cmpst_b32, and all but the -amdgcn-skip-cache-invalidations run follow it
; with a cache invalidate.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1384define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
1385; GFX6-LABEL: local_system_seq_cst_monotonic_cmpxchg:
1386; GFX6:       ; %bb.0: ; %entry
1387; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1388; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1389; GFX6-NEXT:    s_mov_b32 m0, -1
1390; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1391; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1392; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1393; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1394; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1395; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1396; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1397; GFX6-NEXT:    buffer_wbinvl1
1398; GFX6-NEXT:    s_endpgm
1399;
1400; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg:
1401; GFX7:       ; %bb.0: ; %entry
1402; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1403; GFX7-NEXT:    s_mov_b32 m0, -1
1404; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1406; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1407; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1408; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1409; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1410; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1411; GFX7-NEXT:    buffer_wbinvl1_vol
1412; GFX7-NEXT:    s_endpgm
1413;
1414; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
1415; GFX10-WGP:       ; %bb.0: ; %entry
1416; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1417; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1419; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1420; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1421; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1422; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1423; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1424; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1425; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1426; GFX10-WGP-NEXT:    buffer_gl0_inv
1427; GFX10-WGP-NEXT:    buffer_gl1_inv
1428; GFX10-WGP-NEXT:    s_endpgm
1429;
1430; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
1431; GFX10-CU:       ; %bb.0: ; %entry
1432; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1433; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1434; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1435; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1436; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1437; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1438; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1439; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1440; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1442; GFX10-CU-NEXT:    buffer_gl0_inv
1443; GFX10-CU-NEXT:    buffer_gl1_inv
1444; GFX10-CU-NEXT:    s_endpgm
1445;
1446; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg:
1447; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1448; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1449; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1450; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1451; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1452; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1454; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1455; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1456; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1457; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1458; SKIP-CACHE-INV-NEXT:    s_endpgm
1459    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1460entry:
1461  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1462  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic
1463  ret void
1464}
1465
; Volatile cmpxchg (success ordering acquire, failure ordering acquire) on an
; addrspace(3) pointer with no explicit syncscope; the result is not used.
; The pointer is %out advanced by four i32 elements, which the check lines show
; as "offset:16" on ds_cmpst_b32.
; Per the check lines below, ds_cmpst_b32 is followed by a wait and, in all but
; the -amdgcn-skip-cache-invalidations run, a cache invalidate.
; The checks are autogenerated (see the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
1466define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
1467; GFX6-LABEL: local_system_acquire_acquire_cmpxchg:
1468; GFX6:       ; %bb.0: ; %entry
1469; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1470; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1471; GFX6-NEXT:    s_mov_b32 m0, -1
1472; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1474; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1475; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1476; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1477; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1478; GFX6-NEXT:    buffer_wbinvl1
1479; GFX6-NEXT:    s_endpgm
1480;
1481; GFX7-LABEL: local_system_acquire_acquire_cmpxchg:
1482; GFX7:       ; %bb.0: ; %entry
1483; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1484; GFX7-NEXT:    s_mov_b32 m0, -1
1485; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1486; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1487; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1488; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1489; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1490; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1491; GFX7-NEXT:    buffer_wbinvl1_vol
1492; GFX7-NEXT:    s_endpgm
1493;
1494; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
1495; GFX10-WGP:       ; %bb.0: ; %entry
1496; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1497; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1499; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1500; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1501; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1502; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1504; GFX10-WGP-NEXT:    buffer_gl0_inv
1505; GFX10-WGP-NEXT:    buffer_gl1_inv
1506; GFX10-WGP-NEXT:    s_endpgm
1507;
1508; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg:
1509; GFX10-CU:       ; %bb.0: ; %entry
1510; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1511; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1512; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1513; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1514; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1515; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1516; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1518; GFX10-CU-NEXT:    buffer_gl0_inv
1519; GFX10-CU-NEXT:    buffer_gl1_inv
1520; GFX10-CU-NEXT:    s_endpgm
1521;
1522; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg:
1523; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1524; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1525; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1526; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1527; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1529; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1530; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1531; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1532; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1533; SKIP-CACHE-INV-NEXT:    s_endpgm
1534    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1535entry:
1536  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1537  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire
1538  ret void
1539}
1540
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with success
; ordering `release` and failure ordering `acquire`. No syncscope is written on
; the instruction and the cmpxchg result is left unused.
; Expected code, per the checks below: wait instructions before the
; non-returning ds_cmpst_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010), then
; wait instructions again after it followed by a cache invalidate. The run that
; passes -amdgcn-skip-cache-invalidations expects the same waits but no
; invalidate instruction.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1541define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
1542; GFX6-LABEL: local_system_release_acquire_cmpxchg:
1543; GFX6:       ; %bb.0: ; %entry
1544; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1545; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1546; GFX6-NEXT:    s_mov_b32 m0, -1
1547; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1549; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1550; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1551; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1552; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1553; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1554; GFX6-NEXT:    buffer_wbinvl1
1555; GFX6-NEXT:    s_endpgm
1556;
1557; GFX7-LABEL: local_system_release_acquire_cmpxchg:
1558; GFX7:       ; %bb.0: ; %entry
1559; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1560; GFX7-NEXT:    s_mov_b32 m0, -1
1561; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1562; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1563; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1564; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1565; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1566; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1567; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1568; GFX7-NEXT:    buffer_wbinvl1_vol
1569; GFX7-NEXT:    s_endpgm
1570;
1571; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg:
1572; GFX10-WGP:       ; %bb.0: ; %entry
1573; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1574; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1576; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1577; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1578; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1579; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1580; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1581; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1582; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1583; GFX10-WGP-NEXT:    buffer_gl0_inv
1584; GFX10-WGP-NEXT:    buffer_gl1_inv
1585; GFX10-WGP-NEXT:    s_endpgm
1586;
1587; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg:
1588; GFX10-CU:       ; %bb.0: ; %entry
1589; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1590; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1592; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1593; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1594; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1595; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1596; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1597; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1599; GFX10-CU-NEXT:    buffer_gl0_inv
1600; GFX10-CU-NEXT:    buffer_gl1_inv
1601; GFX10-CU-NEXT:    s_endpgm
1602;
1603; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg:
1604; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1605; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1606; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1607; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1608; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1609; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1610; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1611; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1612; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1613; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1614; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1615; SKIP-CACHE-INV-NEXT:    s_endpgm
1616    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1617entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_b32 in every check block above.
1618  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1619  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire
1620  ret void
1621}
1622
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with success
; ordering `acq_rel` and failure ordering `acquire`. No syncscope is written on
; the instruction and the cmpxchg result is left unused.
; Expected code, per the checks below: wait instructions before the
; non-returning ds_cmpst_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010), then
; wait instructions again after it followed by a cache invalidate. The run that
; passes -amdgcn-skip-cache-invalidations expects the same waits but no
; invalidate instruction.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1623define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
1624; GFX6-LABEL: local_system_acq_rel_acquire_cmpxchg:
1625; GFX6:       ; %bb.0: ; %entry
1626; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1627; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1628; GFX6-NEXT:    s_mov_b32 m0, -1
1629; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1630; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1631; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1632; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1633; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1634; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1635; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1636; GFX6-NEXT:    buffer_wbinvl1
1637; GFX6-NEXT:    s_endpgm
1638;
1639; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg:
1640; GFX7:       ; %bb.0: ; %entry
1641; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1642; GFX7-NEXT:    s_mov_b32 m0, -1
1643; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1644; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1645; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1646; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1647; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1648; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1649; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1650; GFX7-NEXT:    buffer_wbinvl1_vol
1651; GFX7-NEXT:    s_endpgm
1652;
1653; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
1654; GFX10-WGP:       ; %bb.0: ; %entry
1655; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1656; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1658; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1659; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1660; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1661; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1662; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1663; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1664; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1665; GFX10-WGP-NEXT:    buffer_gl0_inv
1666; GFX10-WGP-NEXT:    buffer_gl1_inv
1667; GFX10-WGP-NEXT:    s_endpgm
1668;
1669; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
1670; GFX10-CU:       ; %bb.0: ; %entry
1671; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1672; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1673; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1674; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1675; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1676; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1677; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1678; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1679; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1680; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1681; GFX10-CU-NEXT:    buffer_gl0_inv
1682; GFX10-CU-NEXT:    buffer_gl1_inv
1683; GFX10-CU-NEXT:    s_endpgm
1684;
1685; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg:
1686; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1687; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1688; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1689; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1690; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1691; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1692; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1693; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1694; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1695; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1696; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1697; SKIP-CACHE-INV-NEXT:    s_endpgm
1698    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1699entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_b32 in every check block above.
1700  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1701  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire
1702  ret void
1703}
1704
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with success
; ordering `seq_cst` and failure ordering `acquire`. No syncscope is written on
; the instruction and the cmpxchg result is left unused.
; Expected code, per the checks below: wait instructions before the
; non-returning ds_cmpst_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010), then
; wait instructions again after it followed by a cache invalidate. The run that
; passes -amdgcn-skip-cache-invalidations expects the same waits but no
; invalidate instruction.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1705define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
1706; GFX6-LABEL: local_system_seq_cst_acquire_cmpxchg:
1707; GFX6:       ; %bb.0: ; %entry
1708; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1709; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1710; GFX6-NEXT:    s_mov_b32 m0, -1
1711; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1712; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1713; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1714; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1715; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1716; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1717; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1718; GFX6-NEXT:    buffer_wbinvl1
1719; GFX6-NEXT:    s_endpgm
1720;
1721; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg:
1722; GFX7:       ; %bb.0: ; %entry
1723; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1724; GFX7-NEXT:    s_mov_b32 m0, -1
1725; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1727; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1728; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1729; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1730; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1731; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1732; GFX7-NEXT:    buffer_wbinvl1_vol
1733; GFX7-NEXT:    s_endpgm
1734;
1735; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
1736; GFX10-WGP:       ; %bb.0: ; %entry
1737; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1738; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1739; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1740; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1741; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1742; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1743; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1744; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1745; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1746; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1747; GFX10-WGP-NEXT:    buffer_gl0_inv
1748; GFX10-WGP-NEXT:    buffer_gl1_inv
1749; GFX10-WGP-NEXT:    s_endpgm
1750;
1751; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
1752; GFX10-CU:       ; %bb.0: ; %entry
1753; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1754; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1756; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1757; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1758; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1759; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1760; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1761; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1762; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1763; GFX10-CU-NEXT:    buffer_gl0_inv
1764; GFX10-CU-NEXT:    buffer_gl1_inv
1765; GFX10-CU-NEXT:    s_endpgm
1766;
1767; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg:
1768; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1769; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1770; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1771; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1772; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1773; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1774; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1775; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1776; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1777; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1778; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1779; SKIP-CACHE-INV-NEXT:    s_endpgm
1780    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1781entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_b32 in every check block above.
1782  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1783  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire
1784  ret void
1785}
1786
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with `seq_cst`
; as both the success and the failure ordering. No syncscope is written on the
; instruction and the cmpxchg result is left unused.
; Expected code, per the checks below: wait instructions before the
; non-returning ds_cmpst_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010), then
; wait instructions again after it followed by a cache invalidate. The run that
; passes -amdgcn-skip-cache-invalidations expects the same waits but no
; invalidate instruction.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1787define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
1788; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
1789; GFX6:       ; %bb.0: ; %entry
1790; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1791; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1792; GFX6-NEXT:    s_mov_b32 m0, -1
1793; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1794; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1795; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1796; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1797; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1798; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1799; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1800; GFX6-NEXT:    buffer_wbinvl1
1801; GFX6-NEXT:    s_endpgm
1802;
1803; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
1804; GFX7:       ; %bb.0: ; %entry
1805; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1806; GFX7-NEXT:    s_mov_b32 m0, -1
1807; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1808; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1809; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1810; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1811; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1812; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1813; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1814; GFX7-NEXT:    buffer_wbinvl1_vol
1815; GFX7-NEXT:    s_endpgm
1816;
1817; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
1818; GFX10-WGP:       ; %bb.0: ; %entry
1819; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1820; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1822; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1823; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1824; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1825; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1826; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1827; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1828; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1829; GFX10-WGP-NEXT:    buffer_gl0_inv
1830; GFX10-WGP-NEXT:    buffer_gl1_inv
1831; GFX10-WGP-NEXT:    s_endpgm
1832;
1833; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
1834; GFX10-CU:       ; %bb.0: ; %entry
1835; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1836; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1837; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1838; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1839; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1840; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1841; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1842; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1843; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1845; GFX10-CU-NEXT:    buffer_gl0_inv
1846; GFX10-CU-NEXT:    buffer_gl1_inv
1847; GFX10-CU-NEXT:    s_endpgm
1848;
1849; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
1850; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1851; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1852; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1853; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1854; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1855; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1856; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1857; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1858; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1859; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1860; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1861; SKIP-CACHE-INV-NEXT:    s_endpgm
1862    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1863entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_b32 in every check block above.
1864  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1865  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst
1866  ret void
1867}
1868
; Test kernel ("ret" variant): volatile cmpxchg through an addrspace(3) pointer
; with success ordering `acquire` and failure ordering `monotonic`, no
; syncscope written. Field 0 of the result is stored back through %out, and the
; checks expect the returning ds_cmpst_rtn_b32 form followed by ds_write_b32.
; Expected code, per the checks below: no additional wait between the operand
; moves and the ds_cmpst_rtn_b32; after it, s_waitcnt vmcnt(0) lgkmcnt(0) and a
; cache invalidate precede the ds_write_b32. The run that passes
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1869define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
1870; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
1871; GFX6:       ; %bb.0: ; %entry
1872; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1873; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1874; GFX6-NEXT:    s_mov_b32 m0, -1
1875; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1876; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1877; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1878; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1879; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1880; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1881; GFX6-NEXT:    buffer_wbinvl1
1882; GFX6-NEXT:    ds_write_b32 v0, v1
1883; GFX6-NEXT:    s_endpgm
1884;
1885; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
1886; GFX7:       ; %bb.0: ; %entry
1887; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1888; GFX7-NEXT:    s_mov_b32 m0, -1
1889; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1890; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1891; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1892; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1893; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1894; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1895; GFX7-NEXT:    buffer_wbinvl1_vol
1896; GFX7-NEXT:    ds_write_b32 v0, v1
1897; GFX7-NEXT:    s_endpgm
1898;
1899; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
1900; GFX10-WGP:       ; %bb.0: ; %entry
1901; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1902; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1903; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1904; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1905; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1906; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1907; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1908; GFX10-WGP-NEXT:    buffer_gl0_inv
1909; GFX10-WGP-NEXT:    buffer_gl1_inv
1910; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1911; GFX10-WGP-NEXT:    s_endpgm
1912;
1913; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
1914; GFX10-CU:       ; %bb.0: ; %entry
1915; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1916; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1917; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1918; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1919; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1920; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1921; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1922; GFX10-CU-NEXT:    buffer_gl0_inv
1923; GFX10-CU-NEXT:    buffer_gl1_inv
1924; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1925; GFX10-CU-NEXT:    s_endpgm
1926;
1927; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
1928; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1929; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1930; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1931; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1932; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1934; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1935; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1936; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1937; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1938; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1939; SKIP-CACHE-INV-NEXT:    s_endpgm
1940    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1941entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_rtn_b32 in every check block above.
1942  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1943  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic
  ; Use the i32 member of the { i32, i1 } result so the returned value is
  ; consumed; it is written to the un-offset %out pointer.
1944  %val0 = extractvalue { i32, i1 } %val, 0
1945  store i32 %val0, i32 addrspace(3)* %out, align 4
1946  ret void
1947}
1948
; Test kernel ("ret" variant): volatile cmpxchg through an addrspace(3) pointer
; with success ordering `acq_rel` and failure ordering `monotonic`, no
; syncscope written. Field 0 of the result is stored back through %out, and the
; checks expect the returning ds_cmpst_rtn_b32 form followed by ds_write_b32.
; Expected code, per the checks below: wait instructions before the
; ds_cmpst_rtn_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010); after it,
; s_waitcnt vmcnt(0) lgkmcnt(0) and a cache invalidate precede the
; ds_write_b32. The run that passes -amdgcn-skip-cache-invalidations expects
; the waits but no invalidate.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1949define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
1950; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
1951; GFX6:       ; %bb.0: ; %entry
1952; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1953; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1954; GFX6-NEXT:    s_mov_b32 m0, -1
1955; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1957; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1958; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1959; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1960; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1961; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1962; GFX6-NEXT:    buffer_wbinvl1
1963; GFX6-NEXT:    ds_write_b32 v0, v1
1964; GFX6-NEXT:    s_endpgm
1965;
1966; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
1967; GFX7:       ; %bb.0: ; %entry
1968; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1969; GFX7-NEXT:    s_mov_b32 m0, -1
1970; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1972; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1973; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1974; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1975; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1976; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1977; GFX7-NEXT:    buffer_wbinvl1_vol
1978; GFX7-NEXT:    ds_write_b32 v0, v1
1979; GFX7-NEXT:    s_endpgm
1980;
1981; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
1982; GFX10-WGP:       ; %bb.0: ; %entry
1983; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1984; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1986; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1987; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1988; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1989; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1990; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
1991; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1992; GFX10-WGP-NEXT:    buffer_gl0_inv
1993; GFX10-WGP-NEXT:    buffer_gl1_inv
1994; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1995; GFX10-WGP-NEXT:    s_endpgm
1996;
1997; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
1998; GFX10-CU:       ; %bb.0: ; %entry
1999; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2000; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2002; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2003; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2004; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2005; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2006; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2007; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2008; GFX10-CU-NEXT:    buffer_gl0_inv
2009; GFX10-CU-NEXT:    buffer_gl1_inv
2010; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2011; GFX10-CU-NEXT:    s_endpgm
2012;
2013; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
2014; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2015; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2016; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2017; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2018; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2019; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2020; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2021; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2022; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2023; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2024; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2025; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2026; SKIP-CACHE-INV-NEXT:    s_endpgm
2027    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2028entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_rtn_b32 in every check block above.
2029  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2030  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic
  ; Use the i32 member of the { i32, i1 } result so the returned value is
  ; consumed; it is written to the un-offset %out pointer.
2031  %val0 = extractvalue { i32, i1 } %val, 0
2032  store i32 %val0, i32 addrspace(3)* %out, align 4
2033  ret void
2034}
2035
; Test kernel ("ret" variant): volatile cmpxchg through an addrspace(3) pointer
; with success ordering `seq_cst` and failure ordering `monotonic`, no
; syncscope written. Field 0 of the result is stored back through %out, and the
; checks expect the returning ds_cmpst_rtn_b32 form followed by ds_write_b32.
; Expected code, per the checks below: wait instructions before the
; ds_cmpst_rtn_b32 (s_waitcnt, plus s_waitcnt_vscnt on gfx1010); after it,
; s_waitcnt vmcnt(0) lgkmcnt(0) and a cache invalidate precede the
; ds_write_b32. The run that passes -amdgcn-skip-cache-invalidations expects
; the waits but no invalidate.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2036define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
2037; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
2038; GFX6:       ; %bb.0: ; %entry
2039; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2040; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2041; GFX6-NEXT:    s_mov_b32 m0, -1
2042; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2043; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2044; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2045; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2046; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2047; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2048; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2049; GFX6-NEXT:    buffer_wbinvl1
2050; GFX6-NEXT:    ds_write_b32 v0, v1
2051; GFX6-NEXT:    s_endpgm
2052;
2053; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
2054; GFX7:       ; %bb.0: ; %entry
2055; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2056; GFX7-NEXT:    s_mov_b32 m0, -1
2057; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2059; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2060; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2061; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2062; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2063; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2064; GFX7-NEXT:    buffer_wbinvl1_vol
2065; GFX7-NEXT:    ds_write_b32 v0, v1
2066; GFX7-NEXT:    s_endpgm
2067;
2068; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
2069; GFX10-WGP:       ; %bb.0: ; %entry
2070; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2071; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2072; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2073; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2074; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2075; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2076; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2077; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2078; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2079; GFX10-WGP-NEXT:    buffer_gl0_inv
2080; GFX10-WGP-NEXT:    buffer_gl1_inv
2081; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2082; GFX10-WGP-NEXT:    s_endpgm
2083;
2084; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
2085; GFX10-CU:       ; %bb.0: ; %entry
2086; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2087; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2089; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2090; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2091; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2092; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2093; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2094; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2095; GFX10-CU-NEXT:    buffer_gl0_inv
2096; GFX10-CU-NEXT:    buffer_gl1_inv
2097; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2098; GFX10-CU-NEXT:    s_endpgm
2099;
2100; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
2101; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2102; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2103; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2104; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2105; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2106; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2107; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2108; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2109; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2110; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2111; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2112; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2113; SKIP-CACHE-INV-NEXT:    s_endpgm
2114    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2115entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_rtn_b32 in every check block above.
2116  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2117  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic
  ; Use the i32 member of the { i32, i1 } result so the returned value is
  ; consumed; it is written to the un-offset %out pointer.
2118  %val0 = extractvalue { i32, i1 } %val, 0
2119  store i32 %val0, i32 addrspace(3)* %out, align 4
2120  ret void
2121}
2122
; Test kernel ("ret" variant): volatile cmpxchg through an addrspace(3) pointer
; with `acquire` as both the success and the failure ordering, no syncscope
; written. Field 0 of the result is stored back through %out, and the checks
; expect the returning ds_cmpst_rtn_b32 form followed by ds_write_b32.
; Expected code, per the checks below: no additional wait between the operand
; moves and the ds_cmpst_rtn_b32; after it, s_waitcnt vmcnt(0) lgkmcnt(0) and a
; cache invalidate precede the ds_write_b32. The run that passes
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2123define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
2124; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg:
2125; GFX6:       ; %bb.0: ; %entry
2126; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2127; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2128; GFX6-NEXT:    s_mov_b32 m0, -1
2129; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2130; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2131; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2132; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2133; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2134; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2135; GFX6-NEXT:    buffer_wbinvl1
2136; GFX6-NEXT:    ds_write_b32 v0, v1
2137; GFX6-NEXT:    s_endpgm
2138;
2139; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg:
2140; GFX7:       ; %bb.0: ; %entry
2141; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2142; GFX7-NEXT:    s_mov_b32 m0, -1
2143; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2144; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2145; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2146; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2147; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2148; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2149; GFX7-NEXT:    buffer_wbinvl1_vol
2150; GFX7-NEXT:    ds_write_b32 v0, v1
2151; GFX7-NEXT:    s_endpgm
2152;
2153; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
2154; GFX10-WGP:       ; %bb.0: ; %entry
2155; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2156; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2157; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2158; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2159; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2160; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2161; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2162; GFX10-WGP-NEXT:    buffer_gl0_inv
2163; GFX10-WGP-NEXT:    buffer_gl1_inv
2164; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2165; GFX10-WGP-NEXT:    s_endpgm
2166;
2167; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
2168; GFX10-CU:       ; %bb.0: ; %entry
2169; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2170; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2171; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2172; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2173; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2174; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2175; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2176; GFX10-CU-NEXT:    buffer_gl0_inv
2177; GFX10-CU-NEXT:    buffer_gl1_inv
2178; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2179; GFX10-CU-NEXT:    s_endpgm
2180;
2181; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg:
2182; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2183; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2184; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2185; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2186; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2187; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2190; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2191; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2192; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2193; SKIP-CACHE-INV-NEXT:    s_endpgm
2194    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2195entry:
  ; i32 element index 4 is byte offset 16, the "offset:16" operand on the
  ; ds_cmpst_rtn_b32 in every check block above.
2196  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2197  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire
  ; Use the i32 member of the { i32, i1 } result so the returned value is
  ; consumed; it is written to the un-offset %out pointer.
2198  %val0 = extractvalue { i32, i1 } %val, 0
2199  store i32 %val0, i32 addrspace(3)* %out, align 4
2200  ret void
2201}
2202
; Volatile cmpxchg of an i32 in addrspace(3) with no syncscope given and
; release (success) / acquire (failure) ordering. The target is %out advanced
; by 4 i32 elements, which shows up as "offset:16" in the expected
; ds_cmpst_rtn_b32. The old value extracted from the result is stored to %out.
;
; Expected code per the checks below - every run line wraps the
; ds_cmpst_rtn_b32 in "s_waitcnt vmcnt(0) lgkmcnt(0)" before and after it.
; After the trailing wait the GFX6 run expects buffer_wbinvl1, the GFX7 run
; buffer_wbinvl1_vol, and both GFX10 runs buffer_gl0_inv + buffer_gl1_inv
; (plus "s_waitcnt_vscnt null, 0x0" ahead of the atomic). The SKIP-CACHE-INV
; run expects no invalidate instruction.
2203define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
2204; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg:
2205; GFX6:       ; %bb.0: ; %entry
2206; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2207; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2208; GFX6-NEXT:    s_mov_b32 m0, -1
2209; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2211; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2212; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2213; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2214; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2215; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2216; GFX6-NEXT:    buffer_wbinvl1
2217; GFX6-NEXT:    ds_write_b32 v0, v1
2218; GFX6-NEXT:    s_endpgm
2219;
2220; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg:
2221; GFX7:       ; %bb.0: ; %entry
2222; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2223; GFX7-NEXT:    s_mov_b32 m0, -1
2224; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2225; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2226; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2227; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2228; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2229; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2230; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2231; GFX7-NEXT:    buffer_wbinvl1_vol
2232; GFX7-NEXT:    ds_write_b32 v0, v1
2233; GFX7-NEXT:    s_endpgm
2234;
2235; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
2236; GFX10-WGP:       ; %bb.0: ; %entry
2237; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2238; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2240; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2241; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2242; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2243; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2244; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2245; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2246; GFX10-WGP-NEXT:    buffer_gl0_inv
2247; GFX10-WGP-NEXT:    buffer_gl1_inv
2248; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2249; GFX10-WGP-NEXT:    s_endpgm
2250;
2251; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
2252; GFX10-CU:       ; %bb.0: ; %entry
2253; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2254; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2255; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2256; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2257; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2258; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2259; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2260; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2261; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2262; GFX10-CU-NEXT:    buffer_gl0_inv
2263; GFX10-CU-NEXT:    buffer_gl1_inv
2264; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2265; GFX10-CU-NEXT:    s_endpgm
2266;
2267; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg:
2268; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2269; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2270; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2271; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2272; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2273; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2274; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2275; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2276; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2277; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2278; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2279; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2280; SKIP-CACHE-INV-NEXT:    s_endpgm
2281    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2282entry:
2283  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2284  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire
2285  %val0 = extractvalue { i32, i1 } %val, 0
2286  store i32 %val0, i32 addrspace(3)* %out, align 4
2287  ret void
2288}
2289
; Volatile cmpxchg of an i32 in addrspace(3) with no syncscope given and
; acq_rel (success) / acquire (failure) ordering. The target is %out advanced
; by 4 i32 elements, which shows up as "offset:16" in the expected
; ds_cmpst_rtn_b32. The old value extracted from the result is stored to %out.
;
; Expected code per the checks below - every run line wraps the
; ds_cmpst_rtn_b32 in "s_waitcnt vmcnt(0) lgkmcnt(0)" before and after it.
; After the trailing wait the GFX6 run expects buffer_wbinvl1, the GFX7 run
; buffer_wbinvl1_vol, and both GFX10 runs buffer_gl0_inv + buffer_gl1_inv
; (plus "s_waitcnt_vscnt null, 0x0" ahead of the atomic). The SKIP-CACHE-INV
; run expects no invalidate instruction.
2290define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
2291; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
2292; GFX6:       ; %bb.0: ; %entry
2293; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2294; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2295; GFX6-NEXT:    s_mov_b32 m0, -1
2296; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2297; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2298; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2299; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2300; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2301; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2302; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2303; GFX6-NEXT:    buffer_wbinvl1
2304; GFX6-NEXT:    ds_write_b32 v0, v1
2305; GFX6-NEXT:    s_endpgm
2306;
2307; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
2308; GFX7:       ; %bb.0: ; %entry
2309; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2310; GFX7-NEXT:    s_mov_b32 m0, -1
2311; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2313; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2314; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2315; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2316; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2317; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2318; GFX7-NEXT:    buffer_wbinvl1_vol
2319; GFX7-NEXT:    ds_write_b32 v0, v1
2320; GFX7-NEXT:    s_endpgm
2321;
2322; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
2323; GFX10-WGP:       ; %bb.0: ; %entry
2324; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2325; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2327; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2328; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2329; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2330; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2331; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2332; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2333; GFX10-WGP-NEXT:    buffer_gl0_inv
2334; GFX10-WGP-NEXT:    buffer_gl1_inv
2335; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2336; GFX10-WGP-NEXT:    s_endpgm
2337;
2338; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
2339; GFX10-CU:       ; %bb.0: ; %entry
2340; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2341; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2342; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2343; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2344; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2345; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2346; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2347; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2348; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2349; GFX10-CU-NEXT:    buffer_gl0_inv
2350; GFX10-CU-NEXT:    buffer_gl1_inv
2351; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2352; GFX10-CU-NEXT:    s_endpgm
2353;
2354; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
2355; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2356; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2357; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2358; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2359; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2361; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2362; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2363; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2364; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2365; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2366; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2367; SKIP-CACHE-INV-NEXT:    s_endpgm
2368    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2369entry:
2370  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2371  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire
2372  %val0 = extractvalue { i32, i1 } %val, 0
2373  store i32 %val0, i32 addrspace(3)* %out, align 4
2374  ret void
2375}
2376
; Volatile cmpxchg of an i32 in addrspace(3) with no syncscope given and
; seq_cst (success) / acquire (failure) ordering. The target is %out advanced
; by 4 i32 elements, which shows up as "offset:16" in the expected
; ds_cmpst_rtn_b32. The old value extracted from the result is stored to %out.
;
; Expected code per the checks below - every run line wraps the
; ds_cmpst_rtn_b32 in "s_waitcnt vmcnt(0) lgkmcnt(0)" before and after it.
; After the trailing wait the GFX6 run expects buffer_wbinvl1, the GFX7 run
; buffer_wbinvl1_vol, and both GFX10 runs buffer_gl0_inv + buffer_gl1_inv
; (plus "s_waitcnt_vscnt null, 0x0" ahead of the atomic). The SKIP-CACHE-INV
; run expects no invalidate instruction.
2377define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
2378; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
2379; GFX6:       ; %bb.0: ; %entry
2380; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2381; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2382; GFX6-NEXT:    s_mov_b32 m0, -1
2383; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2384; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2385; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2386; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2387; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2388; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2389; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2390; GFX6-NEXT:    buffer_wbinvl1
2391; GFX6-NEXT:    ds_write_b32 v0, v1
2392; GFX6-NEXT:    s_endpgm
2393;
2394; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
2395; GFX7:       ; %bb.0: ; %entry
2396; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2397; GFX7-NEXT:    s_mov_b32 m0, -1
2398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2399; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2400; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2401; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2402; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2403; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2404; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2405; GFX7-NEXT:    buffer_wbinvl1_vol
2406; GFX7-NEXT:    ds_write_b32 v0, v1
2407; GFX7-NEXT:    s_endpgm
2408;
2409; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
2410; GFX10-WGP:       ; %bb.0: ; %entry
2411; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2412; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2413; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2414; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2415; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2416; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2417; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2418; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2419; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2420; GFX10-WGP-NEXT:    buffer_gl0_inv
2421; GFX10-WGP-NEXT:    buffer_gl1_inv
2422; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2423; GFX10-WGP-NEXT:    s_endpgm
2424;
2425; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
2426; GFX10-CU:       ; %bb.0: ; %entry
2427; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2428; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2429; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2430; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2431; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2432; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2433; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2434; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2435; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2436; GFX10-CU-NEXT:    buffer_gl0_inv
2437; GFX10-CU-NEXT:    buffer_gl1_inv
2438; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2439; GFX10-CU-NEXT:    s_endpgm
2440;
2441; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
2442; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2443; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2444; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2445; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2446; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2450; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2451; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2452; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2453; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2454; SKIP-CACHE-INV-NEXT:    s_endpgm
2455    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2456entry:
2457  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2458  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire
2459  %val0 = extractvalue { i32, i1 } %val, 0
2460  store i32 %val0, i32 addrspace(3)* %out, align 4
2461  ret void
2462}
2463
; Volatile cmpxchg of an i32 in addrspace(3) with no syncscope given and
; seq_cst ordering for both success and failure. The target is %out advanced
; by 4 i32 elements, which shows up as "offset:16" in the expected
; ds_cmpst_rtn_b32. The old value extracted from the result is stored to %out.
;
; Expected code per the checks below - every run line wraps the
; ds_cmpst_rtn_b32 in "s_waitcnt vmcnt(0) lgkmcnt(0)" before and after it.
; After the trailing wait the GFX6 run expects buffer_wbinvl1, the GFX7 run
; buffer_wbinvl1_vol, and both GFX10 runs buffer_gl0_inv + buffer_gl1_inv
; (plus "s_waitcnt_vscnt null, 0x0" ahead of the atomic). The SKIP-CACHE-INV
; run expects no invalidate instruction.
2464define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
2465; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
2466; GFX6:       ; %bb.0: ; %entry
2467; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2468; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2469; GFX6-NEXT:    s_mov_b32 m0, -1
2470; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2471; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2472; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2473; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2474; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2475; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2476; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2477; GFX6-NEXT:    buffer_wbinvl1
2478; GFX6-NEXT:    ds_write_b32 v0, v1
2479; GFX6-NEXT:    s_endpgm
2480;
2481; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
2482; GFX7:       ; %bb.0: ; %entry
2483; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2484; GFX7-NEXT:    s_mov_b32 m0, -1
2485; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2486; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2487; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2488; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2489; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2490; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2491; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2492; GFX7-NEXT:    buffer_wbinvl1_vol
2493; GFX7-NEXT:    ds_write_b32 v0, v1
2494; GFX7-NEXT:    s_endpgm
2495;
2496; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
2497; GFX10-WGP:       ; %bb.0: ; %entry
2498; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2499; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2500; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2501; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2502; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2503; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2504; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2505; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2506; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2507; GFX10-WGP-NEXT:    buffer_gl0_inv
2508; GFX10-WGP-NEXT:    buffer_gl1_inv
2509; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2510; GFX10-WGP-NEXT:    s_endpgm
2511;
2512; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
2513; GFX10-CU:       ; %bb.0: ; %entry
2514; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2515; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2516; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2517; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2518; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2519; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2520; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2521; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2522; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2523; GFX10-CU-NEXT:    buffer_gl0_inv
2524; GFX10-CU-NEXT:    buffer_gl1_inv
2525; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2526; GFX10-CU-NEXT:    s_endpgm
2527;
2528; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
2529; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2530; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2531; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2532; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2533; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2537; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2538; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2539; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2540; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2541; SKIP-CACHE-INV-NEXT:    s_endpgm
2542    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2543entry:
2544  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2545  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst
2546  %val0 = extractvalue { i32, i1 } %val, 0
2547  store i32 %val0, i32 addrspace(3)* %out, align 4
2548  ret void
2549}
2550
; Atomic i32 load from the addrspace(3) pointer %in with syncscope("one-as")
; and unordered ordering; the loaded value is then written to %out with an
; ordinary (non-atomic) store.
;
; Expected code per the checks below - on every run line the only ds_*
; instructions are one ds_read_b32 and one ds_write_b32, separated by
; "s_waitcnt lgkmcnt(0)". No vmcnt wait and no buffer_* cache instruction is
; expected on any target.
2551define amdgpu_kernel void @local_system_one_as_unordered_load(
2552; GFX6-LABEL: local_system_one_as_unordered_load:
2553; GFX6:       ; %bb.0: ; %entry
2554; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2555; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2556; GFX6-NEXT:    s_mov_b32 m0, -1
2557; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2558; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2559; GFX6-NEXT:    ds_read_b32 v0, v0
2560; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2561; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX6-NEXT:    ds_write_b32 v1, v0
2563; GFX6-NEXT:    s_endpgm
2564;
2565; GFX7-LABEL: local_system_one_as_unordered_load:
2566; GFX7:       ; %bb.0: ; %entry
2567; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2568; GFX7-NEXT:    s_mov_b32 m0, -1
2569; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2570; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2571; GFX7-NEXT:    ds_read_b32 v0, v0
2572; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2573; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2574; GFX7-NEXT:    ds_write_b32 v1, v0
2575; GFX7-NEXT:    s_endpgm
2576;
2577; GFX10-WGP-LABEL: local_system_one_as_unordered_load:
2578; GFX10-WGP:       ; %bb.0: ; %entry
2579; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2580; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2581; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2582; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2583; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
2584; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
2586; GFX10-WGP-NEXT:    s_endpgm
2587;
2588; GFX10-CU-LABEL: local_system_one_as_unordered_load:
2589; GFX10-CU:       ; %bb.0: ; %entry
2590; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2591; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2592; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2593; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2594; GFX10-CU-NEXT:    ds_read_b32 v0, v0
2595; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2596; GFX10-CU-NEXT:    ds_write_b32 v1, v0
2597; GFX10-CU-NEXT:    s_endpgm
2598;
2599; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load:
2600; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2601; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2602; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2603; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2604; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2605; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
2606; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2607; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2608; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
2609; SKIP-CACHE-INV-NEXT:    s_endpgm
2610    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
2611entry:
2612  %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4
2613  store i32 %val, i32 addrspace(3)* %out
2614  ret void
2615}
2616
; Atomic i32 load from the addrspace(3) pointer %in with syncscope("one-as")
; and monotonic ordering; the loaded value is then written to %out with an
; ordinary (non-atomic) store.
;
; Expected code per the checks below - on every run line the only ds_*
; instructions are one ds_read_b32 and one ds_write_b32, separated by
; "s_waitcnt lgkmcnt(0)". No vmcnt wait and no buffer_* cache instruction is
; expected on any target.
2617define amdgpu_kernel void @local_system_one_as_monotonic_load(
2618; GFX6-LABEL: local_system_one_as_monotonic_load:
2619; GFX6:       ; %bb.0: ; %entry
2620; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2621; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2622; GFX6-NEXT:    s_mov_b32 m0, -1
2623; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2624; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2625; GFX6-NEXT:    ds_read_b32 v0, v0
2626; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2627; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX6-NEXT:    ds_write_b32 v1, v0
2629; GFX6-NEXT:    s_endpgm
2630;
2631; GFX7-LABEL: local_system_one_as_monotonic_load:
2632; GFX7:       ; %bb.0: ; %entry
2633; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2634; GFX7-NEXT:    s_mov_b32 m0, -1
2635; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2636; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2637; GFX7-NEXT:    ds_read_b32 v0, v0
2638; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2639; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2640; GFX7-NEXT:    ds_write_b32 v1, v0
2641; GFX7-NEXT:    s_endpgm
2642;
2643; GFX10-WGP-LABEL: local_system_one_as_monotonic_load:
2644; GFX10-WGP:       ; %bb.0: ; %entry
2645; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2646; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2647; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2648; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2649; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
2650; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2651; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
2652; GFX10-WGP-NEXT:    s_endpgm
2653;
2654; GFX10-CU-LABEL: local_system_one_as_monotonic_load:
2655; GFX10-CU:       ; %bb.0: ; %entry
2656; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2657; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2659; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2660; GFX10-CU-NEXT:    ds_read_b32 v0, v0
2661; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2662; GFX10-CU-NEXT:    ds_write_b32 v1, v0
2663; GFX10-CU-NEXT:    s_endpgm
2664;
2665; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load:
2666; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2667; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2668; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2669; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2670; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2671; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
2672; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2673; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2674; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
2675; SKIP-CACHE-INV-NEXT:    s_endpgm
2676    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
2677entry:
2678  %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4
2679  store i32 %val, i32 addrspace(3)* %out
2680  ret void
2681}
2682
; Atomic i32 load from the addrspace(3) pointer %in with syncscope("one-as")
; and acquire ordering; the loaded value is then written to %out with an
; ordinary (non-atomic) store.
;
; Expected code per the checks below - on every run line the only ds_*
; instructions are one ds_read_b32 and one ds_write_b32, separated by
; "s_waitcnt lgkmcnt(0)". No vmcnt wait and no buffer_* cache instruction is
; expected on any target, despite the acquire ordering.
2683define amdgpu_kernel void @local_system_one_as_acquire_load(
2684; GFX6-LABEL: local_system_one_as_acquire_load:
2685; GFX6:       ; %bb.0: ; %entry
2686; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2687; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2688; GFX6-NEXT:    s_mov_b32 m0, -1
2689; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2691; GFX6-NEXT:    ds_read_b32 v0, v0
2692; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2693; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2694; GFX6-NEXT:    ds_write_b32 v1, v0
2695; GFX6-NEXT:    s_endpgm
2696;
2697; GFX7-LABEL: local_system_one_as_acquire_load:
2698; GFX7:       ; %bb.0: ; %entry
2699; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2700; GFX7-NEXT:    s_mov_b32 m0, -1
2701; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2702; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2703; GFX7-NEXT:    ds_read_b32 v0, v0
2704; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2705; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2706; GFX7-NEXT:    ds_write_b32 v1, v0
2707; GFX7-NEXT:    s_endpgm
2708;
2709; GFX10-WGP-LABEL: local_system_one_as_acquire_load:
2710; GFX10-WGP:       ; %bb.0: ; %entry
2711; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2712; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2713; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2714; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2715; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
2716; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2717; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
2718; GFX10-WGP-NEXT:    s_endpgm
2719;
2720; GFX10-CU-LABEL: local_system_one_as_acquire_load:
2721; GFX10-CU:       ; %bb.0: ; %entry
2722; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2723; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2724; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2725; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2726; GFX10-CU-NEXT:    ds_read_b32 v0, v0
2727; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2728; GFX10-CU-NEXT:    ds_write_b32 v1, v0
2729; GFX10-CU-NEXT:    s_endpgm
2730;
2731; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load:
2732; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2733; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2734; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2735; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2736; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2737; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
2738; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2739; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2740; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
2741; SKIP-CACHE-INV-NEXT:    s_endpgm
2742    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
2743entry:
2744  %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4
2745  store i32 %val, i32 addrspace(3)* %out
2746  ret void
2747}
2748
; Atomic i32 load from the addrspace(3) pointer %in with syncscope("one-as")
; and seq_cst ordering; the loaded value is then written to %out with an
; ordinary (non-atomic) store.
;
; Expected code per the checks below - on every run line the only ds_*
; instructions are one ds_read_b32 and one ds_write_b32, separated by
; "s_waitcnt lgkmcnt(0)". No vmcnt wait and no buffer_* cache instruction is
; expected on any target, despite the seq_cst ordering.
2749define amdgpu_kernel void @local_system_one_as_seq_cst_load(
2750; GFX6-LABEL: local_system_one_as_seq_cst_load:
2751; GFX6:       ; %bb.0: ; %entry
2752; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2753; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2754; GFX6-NEXT:    s_mov_b32 m0, -1
2755; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2756; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2757; GFX6-NEXT:    ds_read_b32 v0, v0
2758; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2759; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2760; GFX6-NEXT:    ds_write_b32 v1, v0
2761; GFX6-NEXT:    s_endpgm
2762;
2763; GFX7-LABEL: local_system_one_as_seq_cst_load:
2764; GFX7:       ; %bb.0: ; %entry
2765; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2766; GFX7-NEXT:    s_mov_b32 m0, -1
2767; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2768; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2769; GFX7-NEXT:    ds_read_b32 v0, v0
2770; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2771; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2772; GFX7-NEXT:    ds_write_b32 v1, v0
2773; GFX7-NEXT:    s_endpgm
2774;
2775; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load:
2776; GFX10-WGP:       ; %bb.0: ; %entry
2777; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2778; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2779; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2780; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2781; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
2782; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2783; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
2784; GFX10-WGP-NEXT:    s_endpgm
2785;
2786; GFX10-CU-LABEL: local_system_one_as_seq_cst_load:
2787; GFX10-CU:       ; %bb.0: ; %entry
2788; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2789; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2790; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2791; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2792; GFX10-CU-NEXT:    ds_read_b32 v0, v0
2793; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2794; GFX10-CU-NEXT:    ds_write_b32 v1, v0
2795; GFX10-CU-NEXT:    s_endpgm
2796;
2797; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load:
2798; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2799; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2800; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2801; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2803; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
2804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2805; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2806; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
2807; SKIP-CACHE-INV-NEXT:    s_endpgm
2808    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
2809entry:
2810  %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4
2811  store i32 %val, i32 addrspace(3)* %out
2812  ret void
2813}
2814
; Atomic i32 store of the kernel argument %in to the addrspace(3) pointer
; %out with syncscope("one-as") and unordered ordering.
;
; Expected code per the checks below - on every run line a single
; ds_write_b32; the only wait is the "s_waitcnt lgkmcnt(0)" that follows the
; s_load instruction(s), and no buffer_* cache instruction is expected.
2815define amdgpu_kernel void @local_system_one_as_unordered_store(
2816; GFX6-LABEL: local_system_one_as_unordered_store:
2817; GFX6:       ; %bb.0: ; %entry
2818; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2819; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2820; GFX6-NEXT:    s_mov_b32 m0, -1
2821; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2822; GFX6-NEXT:    v_mov_b32_e32 v1, s0
2823; GFX6-NEXT:    v_mov_b32_e32 v0, s1
2824; GFX6-NEXT:    ds_write_b32 v0, v1
2825; GFX6-NEXT:    s_endpgm
2826;
2827; GFX7-LABEL: local_system_one_as_unordered_store:
2828; GFX7:       ; %bb.0: ; %entry
2829; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2830; GFX7-NEXT:    s_mov_b32 m0, -1
2831; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2832; GFX7-NEXT:    v_mov_b32_e32 v0, s1
2833; GFX7-NEXT:    v_mov_b32_e32 v1, s0
2834; GFX7-NEXT:    ds_write_b32 v0, v1
2835; GFX7-NEXT:    s_endpgm
2836;
2837; GFX10-WGP-LABEL: local_system_one_as_unordered_store:
2838; GFX10-WGP:       ; %bb.0: ; %entry
2839; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2840; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2841; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
2842; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
2843; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2844; GFX10-WGP-NEXT:    s_endpgm
2845;
2846; GFX10-CU-LABEL: local_system_one_as_unordered_store:
2847; GFX10-CU:       ; %bb.0: ; %entry
2848; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2849; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2850; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
2851; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
2852; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2853; GFX10-CU-NEXT:    s_endpgm
2854;
2855; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store:
2856; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2857; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2858; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2859; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
2861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2862; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2863; SKIP-CACHE-INV-NEXT:    s_endpgm
2864    i32 %in, i32 addrspace(3)* %out) {
2865entry:
2866  store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4
2867  ret void
2868}
2869
; Atomic i32 store of the kernel argument %in to the addrspace(3) pointer
; %out with syncscope("one-as") and monotonic ordering.
;
; Expected code per the checks below - on every run line a single
; ds_write_b32; the only wait is the "s_waitcnt lgkmcnt(0)" that follows the
; s_load instruction(s), and no buffer_* cache instruction is expected.
2870define amdgpu_kernel void @local_system_one_as_monotonic_store(
2871; GFX6-LABEL: local_system_one_as_monotonic_store:
2872; GFX6:       ; %bb.0: ; %entry
2873; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2874; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2875; GFX6-NEXT:    s_mov_b32 m0, -1
2876; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2877; GFX6-NEXT:    v_mov_b32_e32 v1, s0
2878; GFX6-NEXT:    v_mov_b32_e32 v0, s1
2879; GFX6-NEXT:    ds_write_b32 v0, v1
2880; GFX6-NEXT:    s_endpgm
2881;
2882; GFX7-LABEL: local_system_one_as_monotonic_store:
2883; GFX7:       ; %bb.0: ; %entry
2884; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2885; GFX7-NEXT:    s_mov_b32 m0, -1
2886; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2887; GFX7-NEXT:    v_mov_b32_e32 v0, s1
2888; GFX7-NEXT:    v_mov_b32_e32 v1, s0
2889; GFX7-NEXT:    ds_write_b32 v0, v1
2890; GFX7-NEXT:    s_endpgm
2891;
2892; GFX10-WGP-LABEL: local_system_one_as_monotonic_store:
2893; GFX10-WGP:       ; %bb.0: ; %entry
2894; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2895; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2896; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
2897; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
2898; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2899; GFX10-WGP-NEXT:    s_endpgm
2900;
2901; GFX10-CU-LABEL: local_system_one_as_monotonic_store:
2902; GFX10-CU:       ; %bb.0: ; %entry
2903; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2904; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2905; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
2906; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
2907; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2908; GFX10-CU-NEXT:    s_endpgm
2909;
2910; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store:
2911; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2912; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2913; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2914; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
2916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2917; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2918; SKIP-CACHE-INV-NEXT:    s_endpgm
2919    i32 %in, i32 addrspace(3)* %out) {
2920entry:
2921  store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4
2922  ret void
2923}
2924
; Atomic i32 store of the kernel argument %in to the addrspace(3) pointer
; %out with syncscope("one-as") and release ordering.
;
; Expected code per the checks below - on every run line a single
; ds_write_b32; the only wait is the "s_waitcnt lgkmcnt(0)" that follows the
; s_load instruction(s). Despite the release ordering, no additional wait
; immediately precedes the ds_write_b32 and no buffer_* cache instruction is
; expected.
2925define amdgpu_kernel void @local_system_one_as_release_store(
2926; GFX6-LABEL: local_system_one_as_release_store:
2927; GFX6:       ; %bb.0: ; %entry
2928; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2929; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2930; GFX6-NEXT:    s_mov_b32 m0, -1
2931; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2932; GFX6-NEXT:    v_mov_b32_e32 v1, s0
2933; GFX6-NEXT:    v_mov_b32_e32 v0, s1
2934; GFX6-NEXT:    ds_write_b32 v0, v1
2935; GFX6-NEXT:    s_endpgm
2936;
2937; GFX7-LABEL: local_system_one_as_release_store:
2938; GFX7:       ; %bb.0: ; %entry
2939; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2940; GFX7-NEXT:    s_mov_b32 m0, -1
2941; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX7-NEXT:    v_mov_b32_e32 v0, s1
2943; GFX7-NEXT:    v_mov_b32_e32 v1, s0
2944; GFX7-NEXT:    ds_write_b32 v0, v1
2945; GFX7-NEXT:    s_endpgm
2946;
2947; GFX10-WGP-LABEL: local_system_one_as_release_store:
2948; GFX10-WGP:       ; %bb.0: ; %entry
2949; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2950; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2951; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
2952; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
2953; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2954; GFX10-WGP-NEXT:    s_endpgm
2955;
2956; GFX10-CU-LABEL: local_system_one_as_release_store:
2957; GFX10-CU:       ; %bb.0: ; %entry
2958; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2959; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2960; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
2961; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
2962; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2963; GFX10-CU-NEXT:    s_endpgm
2964;
2965; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store:
2966; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2967; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2968; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2969; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2970; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
2971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2972; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2973; SKIP-CACHE-INV-NEXT:    s_endpgm
2974    i32 %in, i32 addrspace(3)* %out) {
2975entry:
2976  store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4
2977  ret void
2978}
2979
; Atomic store of %in through the addrspace(3) pointer %out with seq_cst
; ordering at syncscope("one-as").
; For every target the expected output is a single ds_write_b32 placed after
; the kernel-argument load and its s_waitcnt; no further wait or cache
; instruction is expected before s_endpgm.
2980define amdgpu_kernel void @local_system_one_as_seq_cst_store(
2981; GFX6-LABEL: local_system_one_as_seq_cst_store:
2982; GFX6:       ; %bb.0: ; %entry
2983; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2984; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2985; GFX6-NEXT:    s_mov_b32 m0, -1
2986; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX6-NEXT:    v_mov_b32_e32 v1, s0
2988; GFX6-NEXT:    v_mov_b32_e32 v0, s1
2989; GFX6-NEXT:    ds_write_b32 v0, v1
2990; GFX6-NEXT:    s_endpgm
2991;
2992; GFX7-LABEL: local_system_one_as_seq_cst_store:
2993; GFX7:       ; %bb.0: ; %entry
2994; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2995; GFX7-NEXT:    s_mov_b32 m0, -1
2996; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2997; GFX7-NEXT:    v_mov_b32_e32 v0, s1
2998; GFX7-NEXT:    v_mov_b32_e32 v1, s0
2999; GFX7-NEXT:    ds_write_b32 v0, v1
3000; GFX7-NEXT:    s_endpgm
3001;
3002; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store:
3003; GFX10-WGP:       ; %bb.0: ; %entry
3004; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3005; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3006; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3007; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3008; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3009; GFX10-WGP-NEXT:    s_endpgm
3010;
3011; GFX10-CU-LABEL: local_system_one_as_seq_cst_store:
3012; GFX10-CU:       ; %bb.0: ; %entry
3013; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3014; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3015; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3016; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3017; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3018; GFX10-CU-NEXT:    s_endpgm
3019;
3020; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store:
3021; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3022; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3023; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3024; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3025; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3026; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3027; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3028; SKIP-CACHE-INV-NEXT:    s_endpgm
3029    i32 %in, i32 addrspace(3)* %out) {
3030entry:
3031  store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4
3032  ret void
3033}
3034
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; monotonic ordering at syncscope("one-as"); the result %val is never used.
; For every target the expected output is one ds_wrxchg_rtn_b32 immediately
; followed by s_endpgm, with no wait or cache instruction in between.
3035define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
3036; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw:
3037; GFX6:       ; %bb.0: ; %entry
3038; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3039; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3040; GFX6-NEXT:    s_mov_b32 m0, -1
3041; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3042; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3043; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3044; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3045; GFX6-NEXT:    s_endpgm
3046;
3047; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw:
3048; GFX7:       ; %bb.0: ; %entry
3049; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3050; GFX7-NEXT:    s_mov_b32 m0, -1
3051; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3052; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3053; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3054; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3055; GFX7-NEXT:    s_endpgm
3056;
3057; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw:
3058; GFX10-WGP:       ; %bb.0: ; %entry
3059; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3060; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3061; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3062; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3063; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3064; GFX10-WGP-NEXT:    s_endpgm
3065;
3066; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw:
3067; GFX10-CU:       ; %bb.0: ; %entry
3068; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3069; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3070; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3071; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3072; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3073; GFX10-CU-NEXT:    s_endpgm
3074;
3075; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw:
3076; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3077; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3078; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3079; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3080; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3081; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3082; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3083; SKIP-CACHE-INV-NEXT:    s_endpgm
3084    i32 addrspace(3)* %out, i32 %in) {
3085entry:
3086  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic
3087  ret void
3088}
3089
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; acquire ordering at syncscope("one-as"); the result %val is never used.
; For every target the expected output is one ds_wrxchg_rtn_b32 immediately
; followed by s_endpgm, with no wait or cache instruction in between.
3090define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
3091; GFX6-LABEL: local_system_one_as_acquire_atomicrmw:
3092; GFX6:       ; %bb.0: ; %entry
3093; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3094; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3095; GFX6-NEXT:    s_mov_b32 m0, -1
3096; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3097; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3098; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3099; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3100; GFX6-NEXT:    s_endpgm
3101;
3102; GFX7-LABEL: local_system_one_as_acquire_atomicrmw:
3103; GFX7:       ; %bb.0: ; %entry
3104; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3105; GFX7-NEXT:    s_mov_b32 m0, -1
3106; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3107; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3108; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3109; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3110; GFX7-NEXT:    s_endpgm
3111;
3112; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw:
3113; GFX10-WGP:       ; %bb.0: ; %entry
3114; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3115; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3116; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3117; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3118; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3119; GFX10-WGP-NEXT:    s_endpgm
3120;
3121; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw:
3122; GFX10-CU:       ; %bb.0: ; %entry
3123; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3124; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3125; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3126; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3127; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3128; GFX10-CU-NEXT:    s_endpgm
3129;
3130; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw:
3131; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3132; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3133; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3134; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3135; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3136; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3137; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3138; SKIP-CACHE-INV-NEXT:    s_endpgm
3139    i32 addrspace(3)* %out, i32 %in) {
3140entry:
3141  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
3142  ret void
3143}
3144
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; release ordering at syncscope("one-as"); the result %val is never used.
; For every target the expected output is one ds_wrxchg_rtn_b32 immediately
; followed by s_endpgm, with no wait or cache instruction in between.
3145define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
3146; GFX6-LABEL: local_system_one_as_release_atomicrmw:
3147; GFX6:       ; %bb.0: ; %entry
3148; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3149; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3150; GFX6-NEXT:    s_mov_b32 m0, -1
3151; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3152; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3153; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3154; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3155; GFX6-NEXT:    s_endpgm
3156;
3157; GFX7-LABEL: local_system_one_as_release_atomicrmw:
3158; GFX7:       ; %bb.0: ; %entry
3159; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3160; GFX7-NEXT:    s_mov_b32 m0, -1
3161; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3162; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3163; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3164; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3165; GFX7-NEXT:    s_endpgm
3166;
3167; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw:
3168; GFX10-WGP:       ; %bb.0: ; %entry
3169; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3170; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3172; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3173; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3174; GFX10-WGP-NEXT:    s_endpgm
3175;
3176; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw:
3177; GFX10-CU:       ; %bb.0: ; %entry
3178; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3179; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3180; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3181; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3182; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3183; GFX10-CU-NEXT:    s_endpgm
3184;
3185; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw:
3186; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3187; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3188; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3189; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3191; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3192; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3193; SKIP-CACHE-INV-NEXT:    s_endpgm
3194    i32 addrspace(3)* %out, i32 %in) {
3195entry:
3196  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release
3197  ret void
3198}
3199
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; acq_rel ordering at syncscope("one-as"); the result %val is never used.
; For every target the expected output is one ds_wrxchg_rtn_b32 immediately
; followed by s_endpgm, with no wait or cache instruction in between.
3200define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
3201; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw:
3202; GFX6:       ; %bb.0: ; %entry
3203; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3204; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3205; GFX6-NEXT:    s_mov_b32 m0, -1
3206; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3207; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3208; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3209; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3210; GFX6-NEXT:    s_endpgm
3211;
3212; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw:
3213; GFX7:       ; %bb.0: ; %entry
3214; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3215; GFX7-NEXT:    s_mov_b32 m0, -1
3216; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3217; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3218; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3219; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3220; GFX7-NEXT:    s_endpgm
3221;
3222; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw:
3223; GFX10-WGP:       ; %bb.0: ; %entry
3224; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3225; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3226; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3227; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3228; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3229; GFX10-WGP-NEXT:    s_endpgm
3230;
3231; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw:
3232; GFX10-CU:       ; %bb.0: ; %entry
3233; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3234; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3235; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3236; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3237; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3238; GFX10-CU-NEXT:    s_endpgm
3239;
3240; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw:
3241; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3242; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3243; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3244; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3246; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3247; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3248; SKIP-CACHE-INV-NEXT:    s_endpgm
3249    i32 addrspace(3)* %out, i32 %in) {
3250entry:
3251  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
3252  ret void
3253}
3254
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; seq_cst ordering at syncscope("one-as"); the result %val is never used.
; For every target the expected output is one ds_wrxchg_rtn_b32 immediately
; followed by s_endpgm, with no wait or cache instruction in between.
3255define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
3256; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw:
3257; GFX6:       ; %bb.0: ; %entry
3258; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3259; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3260; GFX6-NEXT:    s_mov_b32 m0, -1
3261; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3262; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3263; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3264; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3265; GFX6-NEXT:    s_endpgm
3266;
3267; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw:
3268; GFX7:       ; %bb.0: ; %entry
3269; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3270; GFX7-NEXT:    s_mov_b32 m0, -1
3271; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3272; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3273; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3274; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3275; GFX7-NEXT:    s_endpgm
3276;
3277; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw:
3278; GFX10-WGP:       ; %bb.0: ; %entry
3279; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3280; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3281; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3282; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3283; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3284; GFX10-WGP-NEXT:    s_endpgm
3285;
3286; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw:
3287; GFX10-CU:       ; %bb.0: ; %entry
3288; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3289; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3290; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3291; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3292; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3293; GFX10-CU-NEXT:    s_endpgm
3294;
3295; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw:
3296; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3297; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3298; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3299; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3301; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3302; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
3303; SKIP-CACHE-INV-NEXT:    s_endpgm
3304    i32 addrspace(3)* %out, i32 %in) {
3305entry:
3306  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
3307  ret void
3308}
3309
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; acquire ordering at syncscope("one-as"), followed by a plain store of the
; exchanged-out value %val back to %out.
; For every target the expected output has an s_waitcnt lgkmcnt(0) between the
; ds_wrxchg_rtn_b32 and the ds_write_b32 that consumes its result.
3310define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
3311; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw:
3312; GFX6:       ; %bb.0: ; %entry
3313; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3314; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3315; GFX6-NEXT:    s_mov_b32 m0, -1
3316; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3317; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3318; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3319; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3320; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3321; GFX6-NEXT:    ds_write_b32 v0, v1
3322; GFX6-NEXT:    s_endpgm
3323;
3324; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw:
3325; GFX7:       ; %bb.0: ; %entry
3326; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3327; GFX7-NEXT:    s_mov_b32 m0, -1
3328; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3329; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3330; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3331; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3333; GFX7-NEXT:    ds_write_b32 v0, v1
3334; GFX7-NEXT:    s_endpgm
3335;
3336; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw:
3337; GFX10-WGP:       ; %bb.0: ; %entry
3338; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3339; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3340; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3341; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3342; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3343; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3344; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3345; GFX10-WGP-NEXT:    s_endpgm
3346;
3347; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw:
3348; GFX10-CU:       ; %bb.0: ; %entry
3349; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3350; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3351; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3352; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3353; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3354; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3355; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3356; GFX10-CU-NEXT:    s_endpgm
3357;
3358; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw:
3359; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3360; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3361; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3362; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3363; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3364; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3365; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3366; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3367; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3368; SKIP-CACHE-INV-NEXT:    s_endpgm
3369    i32 addrspace(3)* %out, i32 %in) {
3370entry:
3371  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
  ; Use the exchange result so that it is live; this is the ds_write_b32 in the
  ; expected output.
3372  store i32 %val, i32 addrspace(3)* %out, align 4
3373  ret void
3374}
3375
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; acq_rel ordering at syncscope("one-as"), followed by a plain store of the
; exchanged-out value %val back to %out.
; For every target the expected output has an s_waitcnt lgkmcnt(0) between the
; ds_wrxchg_rtn_b32 and the ds_write_b32 that consumes its result.
3376define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
3377; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
3378; GFX6:       ; %bb.0: ; %entry
3379; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3380; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3381; GFX6-NEXT:    s_mov_b32 m0, -1
3382; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3383; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3384; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3385; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3386; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3387; GFX6-NEXT:    ds_write_b32 v0, v1
3388; GFX6-NEXT:    s_endpgm
3389;
3390; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
3391; GFX7:       ; %bb.0: ; %entry
3392; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3393; GFX7-NEXT:    s_mov_b32 m0, -1
3394; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3395; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3396; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3397; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3399; GFX7-NEXT:    ds_write_b32 v0, v1
3400; GFX7-NEXT:    s_endpgm
3401;
3402; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
3403; GFX10-WGP:       ; %bb.0: ; %entry
3404; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3405; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3406; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3407; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3408; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3409; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3410; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3411; GFX10-WGP-NEXT:    s_endpgm
3412;
3413; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
3414; GFX10-CU:       ; %bb.0: ; %entry
3415; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3416; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3418; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3419; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3420; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3421; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3422; GFX10-CU-NEXT:    s_endpgm
3423;
3424; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
3425; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3426; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3427; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3428; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3429; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3431; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3432; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3433; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3434; SKIP-CACHE-INV-NEXT:    s_endpgm
3435    i32 addrspace(3)* %out, i32 %in) {
3436entry:
3437  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
  ; Use the exchange result so that it is live; this is the ds_write_b32 in the
  ; expected output.
3438  store i32 %val, i32 addrspace(3)* %out, align 4
3439  ret void
3440}
3441
; Volatile atomicrmw xchg of %in into the addrspace(3) pointer %out with
; seq_cst ordering at syncscope("one-as"), followed by a plain store of the
; exchanged-out value %val back to %out.
; For every target the expected output has an s_waitcnt lgkmcnt(0) between the
; ds_wrxchg_rtn_b32 and the ds_write_b32 that consumes its result.
3442define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
3443; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
3444; GFX6:       ; %bb.0: ; %entry
3445; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3446; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3447; GFX6-NEXT:    s_mov_b32 m0, -1
3448; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3449; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3450; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3451; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3452; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3453; GFX6-NEXT:    ds_write_b32 v0, v1
3454; GFX6-NEXT:    s_endpgm
3455;
3456; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
3457; GFX7:       ; %bb.0: ; %entry
3458; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3459; GFX7-NEXT:    s_mov_b32 m0, -1
3460; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3461; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3462; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3463; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3464; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3465; GFX7-NEXT:    ds_write_b32 v0, v1
3466; GFX7-NEXT:    s_endpgm
3467;
3468; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
3469; GFX10-WGP:       ; %bb.0: ; %entry
3470; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3471; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3472; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3473; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3474; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3475; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3476; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3477; GFX10-WGP-NEXT:    s_endpgm
3478;
3479; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
3480; GFX10-CU:       ; %bb.0: ; %entry
3481; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3482; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3483; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3484; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3485; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3486; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3487; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3488; GFX10-CU-NEXT:    s_endpgm
3489;
3490; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
3491; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3492; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3493; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3494; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3497; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
3498; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3499; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3500; SKIP-CACHE-INV-NEXT:    s_endpgm
3501    i32 addrspace(3)* %out, i32 %in) {
3502entry:
3503  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
  ; Use the exchange result so that it is live; this is the ds_write_b32 in the
  ; expected output.
3504  store i32 %val, i32 addrspace(3)* %out, align 4
3505  ret void
3506}
3507
; Volatile cmpxchg on the i32 four elements past the addrspace(3) pointer %out,
; comparing against %old and writing %in, with monotonic success ordering and
; monotonic failure ordering at syncscope("one-as"). The result %val is unused.
; For every target the expected output is one ds_cmpst_b32 carrying offset 16,
; immediately followed by s_endpgm.
3508define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
3509; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
3510; GFX6:       ; %bb.0: ; %entry
3511; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3512; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3513; GFX6-NEXT:    s_mov_b32 m0, -1
3514; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3515; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3516; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3517; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3518; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3519; GFX6-NEXT:    s_endpgm
3520;
3521; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
3522; GFX7:       ; %bb.0: ; %entry
3523; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3524; GFX7-NEXT:    s_mov_b32 m0, -1
3525; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3526; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3527; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3528; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3529; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3530; GFX7-NEXT:    s_endpgm
3531;
3532; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
3533; GFX10-WGP:       ; %bb.0: ; %entry
3534; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3535; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3536; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3537; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3538; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3539; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3540; GFX10-WGP-NEXT:    s_endpgm
3541;
3542; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
3543; GFX10-CU:       ; %bb.0: ; %entry
3544; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3545; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3546; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3547; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3548; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3549; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3550; GFX10-CU-NEXT:    s_endpgm
3551;
3552; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
3553; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3554; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3555; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3556; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3557; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3558; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3559; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3560; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3561; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3562; SKIP-CACHE-INV-NEXT:    s_endpgm
3563    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3564entry:
  ; Element index 4 of an i32 array; the expected ds_cmpst_b32 carries this as
  ; a byte offset of 16 rather than as a separate address computation.
3565  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3566  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
3567  ret void
3568}
3569
; Volatile cmpxchg on the i32 four elements past the addrspace(3) pointer %out,
; comparing against %old and writing %in, with acquire success ordering and
; monotonic failure ordering at syncscope("one-as"). The result %val is unused.
; For every target the expected output is one ds_cmpst_b32 carrying offset 16,
; immediately followed by s_endpgm.
3570define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
3571; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
3572; GFX6:       ; %bb.0: ; %entry
3573; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3574; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3575; GFX6-NEXT:    s_mov_b32 m0, -1
3576; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3577; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3578; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3579; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3580; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3581; GFX6-NEXT:    s_endpgm
3582;
3583; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
3584; GFX7:       ; %bb.0: ; %entry
3585; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3586; GFX7-NEXT:    s_mov_b32 m0, -1
3587; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3588; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3589; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3590; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3591; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3592; GFX7-NEXT:    s_endpgm
3593;
3594; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
3595; GFX10-WGP:       ; %bb.0: ; %entry
3596; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3597; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3598; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3599; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3600; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3601; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3602; GFX10-WGP-NEXT:    s_endpgm
3603;
3604; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
3605; GFX10-CU:       ; %bb.0: ; %entry
3606; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3607; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3608; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3609; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3610; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3611; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3612; GFX10-CU-NEXT:    s_endpgm
3613;
3614; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
3615; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3616; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3617; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3618; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3619; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3620; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3621; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3622; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3623; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3624; SKIP-CACHE-INV-NEXT:    s_endpgm
3625    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3626entry:
  ; Element index 4 of an i32 array; the expected ds_cmpst_b32 carries this as
  ; a byte offset of 16 rather than as a separate address computation.
3627  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3628  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
3629  ret void
3630}
3631
; Volatile cmpxchg on the i32 four elements past the addrspace(3) pointer %out,
; comparing against %old and writing %in, with release success ordering and
; monotonic failure ordering at syncscope("one-as"). The result %val is unused.
; For every target the expected output is one ds_cmpst_b32 carrying offset 16,
; immediately followed by s_endpgm.
3632define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
3633; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg:
3634; GFX6:       ; %bb.0: ; %entry
3635; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3636; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3637; GFX6-NEXT:    s_mov_b32 m0, -1
3638; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3639; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3640; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3641; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3642; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3643; GFX6-NEXT:    s_endpgm
3644;
3645; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg:
3646; GFX7:       ; %bb.0: ; %entry
3647; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3648; GFX7-NEXT:    s_mov_b32 m0, -1
3649; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3650; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3651; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3652; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3653; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3654; GFX7-NEXT:    s_endpgm
3655;
3656; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
3657; GFX10-WGP:       ; %bb.0: ; %entry
3658; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3659; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3660; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3661; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3662; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3663; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3664; GFX10-WGP-NEXT:    s_endpgm
3665;
3666; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
3667; GFX10-CU:       ; %bb.0: ; %entry
3668; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3669; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3670; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3671; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3672; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3673; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3674; GFX10-CU-NEXT:    s_endpgm
3675;
3676; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg:
3677; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3678; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3679; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3680; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3681; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3685; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3686; SKIP-CACHE-INV-NEXT:    s_endpgm
3687    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3688entry:
  ; Element index 4 of an i32 array; the expected ds_cmpst_b32 carries this as
  ; a byte offset of 16 rather than as a separate address computation.
3689  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3690  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
3691  ret void
3692}
3693
; Volatile cmpxchg on the i32 four elements past the addrspace(3) pointer %out,
; comparing against %old and writing %in, with acq_rel success ordering and
; monotonic failure ordering at syncscope("one-as"). The result %val is unused.
; For every target the expected output is one ds_cmpst_b32 carrying offset 16,
; immediately followed by s_endpgm.
3694define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
3695; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
3696; GFX6:       ; %bb.0: ; %entry
3697; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3698; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3699; GFX6-NEXT:    s_mov_b32 m0, -1
3700; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3701; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3702; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3703; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3704; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3705; GFX6-NEXT:    s_endpgm
3706;
3707; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
3708; GFX7:       ; %bb.0: ; %entry
3709; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3710; GFX7-NEXT:    s_mov_b32 m0, -1
3711; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3712; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3713; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3714; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3715; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3716; GFX7-NEXT:    s_endpgm
3717;
3718; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
3719; GFX10-WGP:       ; %bb.0: ; %entry
3720; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3721; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3722; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3723; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3724; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3725; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3726; GFX10-WGP-NEXT:    s_endpgm
3727;
3728; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
3729; GFX10-CU:       ; %bb.0: ; %entry
3730; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3731; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3732; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3733; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3734; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3735; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3736; GFX10-CU-NEXT:    s_endpgm
3737;
3738; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
3739; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3740; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3741; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3742; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3743; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3744; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3746; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3747; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3748; SKIP-CACHE-INV-NEXT:    s_endpgm
3749    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3750entry:
  ; Element index 4 of an i32 array; the expected ds_cmpst_b32 carries this as
  ; a byte offset of 16 rather than as a separate address computation.
3751  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3752  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
3753  ret void
3754}
3755
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering seq_cst and failure ordering monotonic; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
3756define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
3757; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
3758; GFX6:       ; %bb.0: ; %entry
3759; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3760; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3761; GFX6-NEXT:    s_mov_b32 m0, -1
3762; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3763; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3764; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3765; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3766; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3767; GFX6-NEXT:    s_endpgm
3768;
3769; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
3770; GFX7:       ; %bb.0: ; %entry
3771; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3772; GFX7-NEXT:    s_mov_b32 m0, -1
3773; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3774; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3775; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3776; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3777; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3778; GFX7-NEXT:    s_endpgm
3779;
3780; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
3781; GFX10-WGP:       ; %bb.0: ; %entry
3782; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3783; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3784; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3785; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3786; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3787; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3788; GFX10-WGP-NEXT:    s_endpgm
3789;
3790; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
3791; GFX10-CU:       ; %bb.0: ; %entry
3792; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3793; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3794; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3795; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3796; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3797; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3798; GFX10-CU-NEXT:    s_endpgm
3799;
3800; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
3801; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3802; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3803; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3804; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3805; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3806; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3809; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3810; SKIP-CACHE-INV-NEXT:    s_endpgm
3811    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3812entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
3813  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
3814  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
3815  ret void
3816}
3817
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering acquire and failure ordering acquire; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
3818define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
3819; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
3820; GFX6:       ; %bb.0: ; %entry
3821; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3822; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3823; GFX6-NEXT:    s_mov_b32 m0, -1
3824; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3825; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3826; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3827; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3828; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3829; GFX6-NEXT:    s_endpgm
3830;
3831; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
3832; GFX7:       ; %bb.0: ; %entry
3833; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3834; GFX7-NEXT:    s_mov_b32 m0, -1
3835; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3836; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3837; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3838; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3839; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3840; GFX7-NEXT:    s_endpgm
3841;
3842; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
3843; GFX10-WGP:       ; %bb.0: ; %entry
3844; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3845; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3846; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3847; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3848; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3849; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3850; GFX10-WGP-NEXT:    s_endpgm
3851;
3852; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
3853; GFX10-CU:       ; %bb.0: ; %entry
3854; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3855; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3856; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3857; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3858; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3859; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3860; GFX10-CU-NEXT:    s_endpgm
3861;
3862; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
3863; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3864; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3865; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3866; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3867; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3868; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3870; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3871; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3872; SKIP-CACHE-INV-NEXT:    s_endpgm
3873    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3874entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
3875  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
3876  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
3877  ret void
3878}
3879
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering release and failure ordering acquire; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
3880define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
3881; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg:
3882; GFX6:       ; %bb.0: ; %entry
3883; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3884; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3885; GFX6-NEXT:    s_mov_b32 m0, -1
3886; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3887; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3888; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3889; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3890; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3891; GFX6-NEXT:    s_endpgm
3892;
3893; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg:
3894; GFX7:       ; %bb.0: ; %entry
3895; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3896; GFX7-NEXT:    s_mov_b32 m0, -1
3897; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3899; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3900; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3901; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3902; GFX7-NEXT:    s_endpgm
3903;
3904; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
3905; GFX10-WGP:       ; %bb.0: ; %entry
3906; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3907; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3908; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3909; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3910; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3911; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3912; GFX10-WGP-NEXT:    s_endpgm
3913;
3914; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
3915; GFX10-CU:       ; %bb.0: ; %entry
3916; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3917; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3918; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3919; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3920; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3921; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3922; GFX10-CU-NEXT:    s_endpgm
3923;
3924; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg:
3925; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3926; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3927; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3928; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3929; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3930; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3931; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3932; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3933; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3934; SKIP-CACHE-INV-NEXT:    s_endpgm
3935    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3936entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
3937  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
3938  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
3939  ret void
3940}
3941
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering acq_rel and failure ordering acquire; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
3942define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
3943; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
3944; GFX6:       ; %bb.0: ; %entry
3945; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3946; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3947; GFX6-NEXT:    s_mov_b32 m0, -1
3948; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3949; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3950; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3951; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3952; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3953; GFX6-NEXT:    s_endpgm
3954;
3955; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
3956; GFX7:       ; %bb.0: ; %entry
3957; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3958; GFX7-NEXT:    s_mov_b32 m0, -1
3959; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3960; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3961; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3962; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3963; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3964; GFX7-NEXT:    s_endpgm
3965;
3966; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
3967; GFX10-WGP:       ; %bb.0: ; %entry
3968; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3969; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3970; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3971; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3972; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3973; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3974; GFX10-WGP-NEXT:    s_endpgm
3975;
3976; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
3977; GFX10-CU:       ; %bb.0: ; %entry
3978; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3979; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3980; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3981; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3982; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3983; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3984; GFX10-CU-NEXT:    s_endpgm
3985;
3986; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
3987; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3988; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3989; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3990; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3991; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3992; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3993; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3994; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3995; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
3996; SKIP-CACHE-INV-NEXT:    s_endpgm
3997    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3998entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
3999  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4000  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
4001  ret void
4002}
4003
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering seq_cst and failure ordering acquire; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
4004define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
4005; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
4006; GFX6:       ; %bb.0: ; %entry
4007; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4008; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4009; GFX6-NEXT:    s_mov_b32 m0, -1
4010; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4011; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4012; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4013; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4014; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4015; GFX6-NEXT:    s_endpgm
4016;
4017; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
4018; GFX7:       ; %bb.0: ; %entry
4019; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4020; GFX7-NEXT:    s_mov_b32 m0, -1
4021; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4022; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4023; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4024; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4025; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4026; GFX7-NEXT:    s_endpgm
4027;
4028; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
4029; GFX10-WGP:       ; %bb.0: ; %entry
4030; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4031; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4032; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4033; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4034; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4035; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4036; GFX10-WGP-NEXT:    s_endpgm
4037;
4038; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
4039; GFX10-CU:       ; %bb.0: ; %entry
4040; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4041; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4042; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4043; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4044; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4045; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4046; GFX10-CU-NEXT:    s_endpgm
4047;
4048; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
4049; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4050; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4051; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4052; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4053; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4054; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4056; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4057; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4058; SKIP-CACHE-INV-NEXT:    s_endpgm
4059    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4060entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4061  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4062  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
4063  ret void
4064}
4065
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering seq_cst and failure ordering seq_cst; the result is unused.
; Every checked configuration below expects the argument loads, one
; ds_cmpst_b32 at offset:16, and then s_endpgm with nothing in between.
4066define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
4067; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
4068; GFX6:       ; %bb.0: ; %entry
4069; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4070; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4071; GFX6-NEXT:    s_mov_b32 m0, -1
4072; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4073; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4074; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4075; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4076; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4077; GFX6-NEXT:    s_endpgm
4078;
4079; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
4080; GFX7:       ; %bb.0: ; %entry
4081; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4082; GFX7-NEXT:    s_mov_b32 m0, -1
4083; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4084; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4085; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4086; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4087; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4088; GFX7-NEXT:    s_endpgm
4089;
4090; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
4091; GFX10-WGP:       ; %bb.0: ; %entry
4092; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4093; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4094; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4095; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4096; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4097; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4098; GFX10-WGP-NEXT:    s_endpgm
4099;
4100; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
4101; GFX10-CU:       ; %bb.0: ; %entry
4102; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4103; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4104; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4105; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4106; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4107; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4108; GFX10-CU-NEXT:    s_endpgm
4109;
4110; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
4111; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4112; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4113; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4114; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4115; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4116; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4117; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4118; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4119; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4120; SKIP-CACHE-INV-NEXT:    s_endpgm
4121    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4122entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4123  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4124  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
4125  ret void
4126}
4127
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering acquire and failure ordering monotonic; the loaded value is
; stored back through %out. Every checked configuration below expects
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then ds_write_b32.
4128define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
4129; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
4130; GFX6:       ; %bb.0: ; %entry
4131; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4132; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4133; GFX6-NEXT:    s_mov_b32 m0, -1
4134; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4135; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4136; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4137; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4138; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4139; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4140; GFX6-NEXT:    ds_write_b32 v0, v1
4141; GFX6-NEXT:    s_endpgm
4142;
4143; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
4144; GFX7:       ; %bb.0: ; %entry
4145; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4146; GFX7-NEXT:    s_mov_b32 m0, -1
4147; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4148; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4149; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4150; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4151; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4152; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4153; GFX7-NEXT:    ds_write_b32 v0, v1
4154; GFX7-NEXT:    s_endpgm
4155;
4156; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
4157; GFX10-WGP:       ; %bb.0: ; %entry
4158; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4159; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4160; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4161; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4162; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4163; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4164; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4165; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4166; GFX10-WGP-NEXT:    s_endpgm
4167;
4168; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
4169; GFX10-CU:       ; %bb.0: ; %entry
4170; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4171; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4172; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4173; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4174; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4175; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4176; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4177; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4178; GFX10-CU-NEXT:    s_endpgm
4179;
4180; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
4181; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4182; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4183; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4184; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4185; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4186; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4187; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4189; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4190; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4191; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4192; SKIP-CACHE-INV-NEXT:    s_endpgm
4193    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4194entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4195  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4196  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
4197  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result is what makes the returning form of the instruction appear.
4198  store i32 %val0, i32 addrspace(3)* %out, align 4
4199  ret void
4200}
4201
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering acq_rel and failure ordering monotonic; the loaded value is
; stored back through %out. Every checked configuration below expects
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then ds_write_b32.
4202define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
4203; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4204; GFX6:       ; %bb.0: ; %entry
4205; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4206; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4207; GFX6-NEXT:    s_mov_b32 m0, -1
4208; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4209; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4210; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4211; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4212; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4213; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4214; GFX6-NEXT:    ds_write_b32 v0, v1
4215; GFX6-NEXT:    s_endpgm
4216;
4217; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4218; GFX7:       ; %bb.0: ; %entry
4219; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4220; GFX7-NEXT:    s_mov_b32 m0, -1
4221; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4222; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4223; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4224; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4225; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4226; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4227; GFX7-NEXT:    ds_write_b32 v0, v1
4228; GFX7-NEXT:    s_endpgm
4229;
4230; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4231; GFX10-WGP:       ; %bb.0: ; %entry
4232; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4233; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4234; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4235; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4236; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4237; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4238; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4239; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4240; GFX10-WGP-NEXT:    s_endpgm
4241;
4242; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4243; GFX10-CU:       ; %bb.0: ; %entry
4244; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4245; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4246; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4247; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4248; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4249; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4250; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4251; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4252; GFX10-CU-NEXT:    s_endpgm
4253;
4254; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4255; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4256; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4257; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4258; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4259; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4260; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4261; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4262; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4263; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4264; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4265; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4266; SKIP-CACHE-INV-NEXT:    s_endpgm
4267    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4268entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4269  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4270  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
4271  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result is what makes the returning form of the instruction appear.
4272  store i32 %val0, i32 addrspace(3)* %out, align 4
4273  ret void
4274}
4275
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering seq_cst and failure ordering monotonic; the loaded value is
; stored back through %out. Every checked configuration below expects
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then ds_write_b32.
4276define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
4277; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4278; GFX6:       ; %bb.0: ; %entry
4279; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4280; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4281; GFX6-NEXT:    s_mov_b32 m0, -1
4282; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4283; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4284; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4285; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4286; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4287; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4288; GFX6-NEXT:    ds_write_b32 v0, v1
4289; GFX6-NEXT:    s_endpgm
4290;
4291; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4292; GFX7:       ; %bb.0: ; %entry
4293; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4294; GFX7-NEXT:    s_mov_b32 m0, -1
4295; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4296; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4297; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4298; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4299; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4300; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4301; GFX7-NEXT:    ds_write_b32 v0, v1
4302; GFX7-NEXT:    s_endpgm
4303;
4304; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4305; GFX10-WGP:       ; %bb.0: ; %entry
4306; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4307; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4308; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4310; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4311; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4312; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4313; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4314; GFX10-WGP-NEXT:    s_endpgm
4315;
4316; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4317; GFX10-CU:       ; %bb.0: ; %entry
4318; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4319; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4320; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4321; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4322; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4323; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4324; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4325; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4326; GFX10-CU-NEXT:    s_endpgm
4327;
4328; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4329; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4330; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4331; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4332; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4333; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4334; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4335; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4336; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4337; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4338; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4339; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4340; SKIP-CACHE-INV-NEXT:    s_endpgm
4341    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4342entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4343  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4344  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
4345  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result is what makes the returning form of the instruction appear.
4346  store i32 %val0, i32 addrspace(3)* %out, align 4
4347  ret void
4348}
4349
; Volatile cmpxchg on an addrspace(3) pointer at syncscope("one-as") with
; success ordering acquire and failure ordering acquire; the loaded value is
; stored back through %out. Every checked configuration below expects
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then ds_write_b32.
4350define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
4351; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
4352; GFX6:       ; %bb.0: ; %entry
4353; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4354; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4355; GFX6-NEXT:    s_mov_b32 m0, -1
4356; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4357; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4358; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4359; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4360; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4361; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4362; GFX6-NEXT:    ds_write_b32 v0, v1
4363; GFX6-NEXT:    s_endpgm
4364;
4365; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
4366; GFX7:       ; %bb.0: ; %entry
4367; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4368; GFX7-NEXT:    s_mov_b32 m0, -1
4369; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4370; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4371; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4372; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4373; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4374; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4375; GFX7-NEXT:    ds_write_b32 v0, v1
4376; GFX7-NEXT:    s_endpgm
4377;
4378; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
4379; GFX10-WGP:       ; %bb.0: ; %entry
4380; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4381; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4382; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4383; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4384; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4385; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4386; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4387; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4388; GFX10-WGP-NEXT:    s_endpgm
4389;
4390; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
4391; GFX10-CU:       ; %bb.0: ; %entry
4392; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4393; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4394; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4395; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4396; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4397; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4398; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4399; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4400; GFX10-CU-NEXT:    s_endpgm
4401;
4402; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
4403; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4404; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4405; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4406; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4407; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4409; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4410; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4411; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4412; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4413; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4414; SKIP-CACHE-INV-NEXT:    s_endpgm
4415    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4416entry:
  ; Address i32 element 4 of %out; the assertions above show it as offset:16.
4417  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the memory value with %old and store %in when they are equal.
4418  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
4419  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result is what makes the returning form of the instruction appear.
4420  store i32 %val0, i32 addrspace(3)* %out, align 4
4421  ret void
4422}
4423
; Value-returning cmpxchg on a local (addrspace(3)) pointer with
; syncscope("one-as"), success ordering release and failure ordering acquire.
; The compare-and-swap targets element 4 of %out (the 16-byte offset on the
; expected ds_cmpst_rtn_b32), and the loaded value (field 0 of the result
; pair) is then stored back to %out itself.
4424define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
4425; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
4426; GFX6:       ; %bb.0: ; %entry
4427; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4428; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4429; GFX6-NEXT:    s_mov_b32 m0, -1
4430; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4431; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4432; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4433; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4434; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4435; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4436; GFX6-NEXT:    ds_write_b32 v0, v1
4437; GFX6-NEXT:    s_endpgm
4438;
4439; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
4440; GFX7:       ; %bb.0: ; %entry
4441; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4442; GFX7-NEXT:    s_mov_b32 m0, -1
4443; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4445; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4446; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4447; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4448; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4449; GFX7-NEXT:    ds_write_b32 v0, v1
4450; GFX7-NEXT:    s_endpgm
4451;
4452; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
4453; GFX10-WGP:       ; %bb.0: ; %entry
4454; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4455; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4456; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4457; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4458; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4459; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4460; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4461; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4462; GFX10-WGP-NEXT:    s_endpgm
4463;
4464; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
4465; GFX10-CU:       ; %bb.0: ; %entry
4466; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4467; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4468; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4469; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4470; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4471; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4472; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4473; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4474; GFX10-CU-NEXT:    s_endpgm
4475;
4476; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
4477; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4478; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4479; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4480; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4481; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4482; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4483; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4484; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4485; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4486; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4487; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4488; SKIP-CACHE-INV-NEXT:    s_endpgm
4489    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4490entry:
4491  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
4492  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
4493  %val0 = extractvalue { i32, i1 } %val, 0
4494  store i32 %val0, i32 addrspace(3)* %out, align 4
4495  ret void
4496}
4497
; Value-returning cmpxchg on a local (addrspace(3)) pointer with
; syncscope("one-as"), success ordering acq_rel and failure ordering acquire.
; The compare-and-swap targets element 4 of %out (the 16-byte offset on the
; expected ds_cmpst_rtn_b32), and the loaded value (field 0 of the result
; pair) is then stored back to %out itself.
4498define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
4499; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
4500; GFX6:       ; %bb.0: ; %entry
4501; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4502; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4503; GFX6-NEXT:    s_mov_b32 m0, -1
4504; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4505; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4506; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4507; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4508; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4509; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4510; GFX6-NEXT:    ds_write_b32 v0, v1
4511; GFX6-NEXT:    s_endpgm
4512;
4513; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
4514; GFX7:       ; %bb.0: ; %entry
4515; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4516; GFX7-NEXT:    s_mov_b32 m0, -1
4517; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4518; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4519; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4520; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4521; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4522; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4523; GFX7-NEXT:    ds_write_b32 v0, v1
4524; GFX7-NEXT:    s_endpgm
4525;
4526; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
4527; GFX10-WGP:       ; %bb.0: ; %entry
4528; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4529; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4530; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4531; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4532; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4533; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4534; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4535; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4536; GFX10-WGP-NEXT:    s_endpgm
4537;
4538; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
4539; GFX10-CU:       ; %bb.0: ; %entry
4540; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4541; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4542; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4543; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4544; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4545; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4546; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4547; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4548; GFX10-CU-NEXT:    s_endpgm
4549;
4550; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
4551; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4552; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4553; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4554; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4555; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4556; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4557; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4558; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4559; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4560; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4561; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4562; SKIP-CACHE-INV-NEXT:    s_endpgm
4563    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4564entry:
4565  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
4566  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
4567  %val0 = extractvalue { i32, i1 } %val, 0
4568  store i32 %val0, i32 addrspace(3)* %out, align 4
4569  ret void
4570}
4571
; Value-returning cmpxchg on a local (addrspace(3)) pointer with
; syncscope("one-as"), success ordering seq_cst and failure ordering acquire.
; The compare-and-swap targets element 4 of %out (the 16-byte offset on the
; expected ds_cmpst_rtn_b32), and the loaded value (field 0 of the result
; pair) is then stored back to %out itself.
4572define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
4573; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
4574; GFX6:       ; %bb.0: ; %entry
4575; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4576; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4577; GFX6-NEXT:    s_mov_b32 m0, -1
4578; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4579; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4580; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4581; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4582; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4583; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4584; GFX6-NEXT:    ds_write_b32 v0, v1
4585; GFX6-NEXT:    s_endpgm
4586;
4587; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
4588; GFX7:       ; %bb.0: ; %entry
4589; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4590; GFX7-NEXT:    s_mov_b32 m0, -1
4591; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4592; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4593; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4594; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4595; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4596; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4597; GFX7-NEXT:    ds_write_b32 v0, v1
4598; GFX7-NEXT:    s_endpgm
4599;
4600; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
4601; GFX10-WGP:       ; %bb.0: ; %entry
4602; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4603; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4604; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4605; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4606; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4607; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4608; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4609; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4610; GFX10-WGP-NEXT:    s_endpgm
4611;
4612; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
4613; GFX10-CU:       ; %bb.0: ; %entry
4614; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4615; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4616; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4617; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4618; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4619; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4620; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4621; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4622; GFX10-CU-NEXT:    s_endpgm
4623;
4624; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
4625; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4626; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4627; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4628; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4629; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4630; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4631; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4632; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4633; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4634; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4635; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4636; SKIP-CACHE-INV-NEXT:    s_endpgm
4637    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4638entry:
4639  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
4640  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
4641  %val0 = extractvalue { i32, i1 } %val, 0
4642  store i32 %val0, i32 addrspace(3)* %out, align 4
4643  ret void
4644}
4645
; Value-returning cmpxchg on a local (addrspace(3)) pointer with
; syncscope("one-as"), using seq_cst for both the success and failure orderings.
; The compare-and-swap targets element 4 of %out (the 16-byte offset on the
; expected ds_cmpst_rtn_b32), and the loaded value (field 0 of the result
; pair) is then stored back to %out itself.
4646define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
4647; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
4648; GFX6:       ; %bb.0: ; %entry
4649; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4650; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4651; GFX6-NEXT:    s_mov_b32 m0, -1
4652; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4653; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4654; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4655; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4656; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4657; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4658; GFX6-NEXT:    ds_write_b32 v0, v1
4659; GFX6-NEXT:    s_endpgm
4660;
4661; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
4662; GFX7:       ; %bb.0: ; %entry
4663; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4664; GFX7-NEXT:    s_mov_b32 m0, -1
4665; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4666; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4667; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4668; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4669; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4670; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4671; GFX7-NEXT:    ds_write_b32 v0, v1
4672; GFX7-NEXT:    s_endpgm
4673;
4674; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
4675; GFX10-WGP:       ; %bb.0: ; %entry
4676; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4677; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4678; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4679; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4680; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4681; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4682; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4683; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
4684; GFX10-WGP-NEXT:    s_endpgm
4685;
4686; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
4687; GFX10-CU:       ; %bb.0: ; %entry
4688; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4689; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4690; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4691; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4692; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4693; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4694; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4695; GFX10-CU-NEXT:    ds_write_b32 v0, v1
4696; GFX10-CU-NEXT:    s_endpgm
4697;
4698; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
4699; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4700; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4701; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4702; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4703; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4705; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4706; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4707; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
4708; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4709; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
4710; SKIP-CACHE-INV-NEXT:    s_endpgm
4711    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4712entry:
4713  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
4714  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
4715  %val0 = extractvalue { i32, i1 } %val, 0
4716  store i32 %val0, i32 addrspace(3)* %out, align 4
4717  ret void
4718}
4719
4720