; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s

; Kernel under test: an atomic i32 load from %in (address space 3) at
; syncscope("workgroup") with `unordered` ordering, followed by a plain
; (non-atomic) store of the loaded value to %out.
; For every llc invocation the expected code is a ds_read_b32 followed by a
; ds_write_b32; the only wait between them is s_waitcnt lgkmcnt(0), and no
; cache-invalidate instruction is expected.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v1, v0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_unordered_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_read_b32 v0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v1, v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_unordered_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_unordered_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v1, v0
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Kernel under test: an atomic i32 load from %in (address space 3) at
; syncscope("workgroup") with `monotonic` ordering, followed by a plain
; (non-atomic) store of the loaded value to %out.
; For every llc invocation the expected code is a ds_read_b32 followed by a
; ds_write_b32; the only wait between them is s_waitcnt lgkmcnt(0), and no
; cache-invalidate instruction is expected.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX6-LABEL: local_workgroup_monotonic_load:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v1, v0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_read_b32 v0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v1, v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v1, v0
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Kernel under test: an atomic i32 load from %in (address space 3) at
; syncscope("workgroup") with `acquire` ordering, followed by a plain
; (non-atomic) store of the loaded value to %out.
; Expected code, per llc invocation:
;   - gfx600, gfx700 (both triples) and gfx90a without +tgsplit: the
;     s_waitcnt lgkmcnt(0) comes directly after the ds_read_b32.
;   - gfx1010 without +cumode: the wait is followed by buffer_gl0_inv.
;   - gfx1010 with +cumode: a wait only, no invalidate.
;   - gfx90a with +tgsplit: buffer_wbinvl1_vol directly after the ds_read_b32,
;     with the lgkmcnt(0) wait placed before the ds_write_b32.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX6-LABEL: local_workgroup_acquire_load:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v1, v0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_acquire_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_read_b32 v0, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b32 v1, v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_acquire_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_acquire_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v1, v0
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Kernel under test: an atomic i32 load from %in (address space 3) at
; syncscope("workgroup") with `seq_cst` ordering, followed by a plain
; (non-atomic) store of the loaded value to %out.
; Expected code, per llc invocation:
;   - gfx600, gfx700 (both triples) and gfx90a without +tgsplit: an
;     s_waitcnt lgkmcnt(0) immediately before the ds_read_b32 and another
;     immediately after it.
;   - gfx1010 with +cumode: s_waitcnt lgkmcnt(0) before and after the
;     ds_read_b32, no invalidate.
;   - gfx1010 without +cumode: s_waitcnt vmcnt(0) lgkmcnt(0) plus
;     s_waitcnt_vscnt null, 0x0 before the ds_read_b32; a wait and
;     buffer_gl0_inv after it.
;   - gfx90a with +tgsplit: s_waitcnt vmcnt(0) lgkmcnt(0) before the
;     ds_read_b32 and buffer_wbinvl1_vol directly after it.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX6-LABEL: local_workgroup_seq_cst_load:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v1, v0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_seq_cst_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_read_b32 v0, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b32 v1, v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_seq_cst_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_seq_cst_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v1, v0
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Kernel under test: an atomic store of the i32 argument %in to %out
; (address space 3) at syncscope("workgroup") with `unordered` ordering.
; For every llc invocation the expected code is a single ds_write_b32 after
; the kernel-argument load has been waited on; no further s_waitcnt and no
; cache instruction is expected before s_endpgm.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX6-LABEL: local_workgroup_unordered_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_unordered_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_unordered_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_unordered_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4
  ret void
}

; Kernel under test: an atomic store of the i32 argument %in to %out
; (address space 3) at syncscope("workgroup") with `monotonic` ordering.
; For every llc invocation the expected code is a single ds_write_b32 after
; the kernel-argument load has been waited on; no further s_waitcnt and no
; cache instruction is expected before s_endpgm.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX6-LABEL: local_workgroup_monotonic_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4
  ret void
}

; Kernel under test: an atomic store of the i32 argument %in to %out
; (address space 3) at syncscope("workgroup") with `release` ordering.
; Expected code, per llc invocation -- the instruction(s) placed immediately
; before the ds_write_b32:
;   - gfx600, gfx700 (both triples), gfx1010 with +cumode and gfx90a without
;     +tgsplit: s_waitcnt lgkmcnt(0).
;   - gfx1010 without +cumode: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;     s_waitcnt_vscnt null, 0x0.
;   - gfx90a with +tgsplit: s_waitcnt vmcnt(0) lgkmcnt(0).
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_release_store(
; GFX6-LABEL: local_workgroup_release_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_release_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_release_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_release_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_release_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
  ret void
}

; Kernel under test: an atomic store of the i32 argument %in to %out
; (address space 3) at syncscope("workgroup") with `seq_cst` ordering.
; Expected code, per llc invocation -- the instruction(s) placed immediately
; before the ds_write_b32:
;   - gfx600, gfx700 (both triples), gfx1010 with +cumode and gfx90a without
;     +tgsplit: s_waitcnt lgkmcnt(0).
;   - gfx1010 without +cumode: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;     s_waitcnt_vscnt null, 0x0.
;   - gfx90a with +tgsplit: s_waitcnt vmcnt(0) lgkmcnt(0).
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX6-LABEL: local_workgroup_seq_cst_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_seq_cst_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_seq_cst_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_seq_cst_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4
  ret void
}

; Kernel under test: a volatile `atomicrmw xchg` of the i32 argument %in into
; %out (address space 3) at syncscope("workgroup") with `monotonic` ordering.
; The returned old value (%val) is not used by the kernel.
; For every llc invocation the expected code is a single ds_wrxchg_rtn_b32
; followed directly by s_endpgm: no s_waitcnt after the exchange and no cache
; instruction.
; The assertions embedded in the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX6-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic
  ret void
}

; Test: workgroup-scope `acquire` atomicrmw xchg through an addrspace(3)
; pointer; the exchanged value is not used afterwards.
; Expected output, per the assertions below: every run emits
; ds_wrxchg_rtn_b32. All runs except gfx90a with +tgsplit expect
; `s_waitcnt lgkmcnt(0)` after the exchange; gfx1010 without +cumode
; additionally expects `buffer_gl0_inv` after that wait; gfx90a with +tgsplit
; expects `buffer_wbinvl1_vol` directly after the exchange, with no wait.
755define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
756; GFX6-LABEL: local_workgroup_acquire_atomicrmw:
757; GFX6:       ; %bb.0: ; %entry
758; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
759; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
760; GFX6-NEXT:    s_mov_b32 m0, -1
761; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX6-NEXT:    v_mov_b32_e32 v0, s0
763; GFX6-NEXT:    v_mov_b32_e32 v1, s1
764; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
765; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX6-NEXT:    s_endpgm
767;
768; GFX7-LABEL: local_workgroup_acquire_atomicrmw:
769; GFX7:       ; %bb.0: ; %entry
770; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
771; GFX7-NEXT:    s_mov_b32 m0, -1
772; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX7-NEXT:    v_mov_b32_e32 v0, s0
774; GFX7-NEXT:    v_mov_b32_e32 v1, s1
775; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
776; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX7-NEXT:    s_endpgm
778;
779; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw:
780; GFX10-WGP:       ; %bb.0: ; %entry
781; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
782; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
784; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
785; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
786; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX10-WGP-NEXT:    buffer_gl0_inv
788; GFX10-WGP-NEXT:    s_endpgm
789;
790; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw:
791; GFX10-CU:       ; %bb.0: ; %entry
792; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
793; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
795; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
796; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
797; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
798; GFX10-CU-NEXT:    s_endpgm
799;
800; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_atomicrmw:
801; SKIP-CACHE-INV:       ; %bb.0: ; %entry
802; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
803; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
804; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
806; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
807; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
808; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
809; SKIP-CACHE-INV-NEXT:    s_endpgm
810;
811; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
812; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
813; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
814; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
816; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
817; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
818; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
820;
821; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
822; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
823; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
824; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
825; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
826; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
827; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
828; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
829; GFX90A-TGSPLIT-NEXT:    s_endpgm
830    i32 addrspace(3)* %out, i32 %in) {
831entry:
  ; %val has no users in this function; only the atomic operation is checked.
832  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
833  ret void
834}
835
; Test: workgroup-scope `release` atomicrmw xchg through an addrspace(3)
; pointer; the exchanged value is not used afterwards.
; Expected output, per the assertions below: every run expects a wait
; immediately before ds_wrxchg_rtn_b32 and nothing but s_endpgm after it.
; That wait is `s_waitcnt lgkmcnt(0)` in most runs. gfx1010 without +cumode
; expects `s_waitcnt vmcnt(0) lgkmcnt(0)` followed by
; `s_waitcnt_vscnt null, 0x0`, and gfx90a with +tgsplit expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)`.
836define amdgpu_kernel void @local_workgroup_release_atomicrmw(
837; GFX6-LABEL: local_workgroup_release_atomicrmw:
838; GFX6:       ; %bb.0: ; %entry
839; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
840; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
841; GFX6-NEXT:    s_mov_b32 m0, -1
842; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX6-NEXT:    v_mov_b32_e32 v0, s0
844; GFX6-NEXT:    v_mov_b32_e32 v1, s1
845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
847; GFX6-NEXT:    s_endpgm
848;
849; GFX7-LABEL: local_workgroup_release_atomicrmw:
850; GFX7:       ; %bb.0: ; %entry
851; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
852; GFX7-NEXT:    s_mov_b32 m0, -1
853; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX7-NEXT:    v_mov_b32_e32 v0, s0
855; GFX7-NEXT:    v_mov_b32_e32 v1, s1
856; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
858; GFX7-NEXT:    s_endpgm
859;
860; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw:
861; GFX10-WGP:       ; %bb.0: ; %entry
862; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
863; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
865; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
866; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
867; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
868; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
869; GFX10-WGP-NEXT:    s_endpgm
870;
871; GFX10-CU-LABEL: local_workgroup_release_atomicrmw:
872; GFX10-CU:       ; %bb.0: ; %entry
873; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
874; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
876; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
877; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
879; GFX10-CU-NEXT:    s_endpgm
880;
881; SKIP-CACHE-INV-LABEL: local_workgroup_release_atomicrmw:
882; SKIP-CACHE-INV:       ; %bb.0: ; %entry
883; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
884; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
885; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
886; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
887; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
888; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
889; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
890; SKIP-CACHE-INV-NEXT:    s_endpgm
891;
892; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
893; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
894; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
895; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
897; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
898; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
900; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
901;
902; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
903; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
904; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
905; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
907; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
908; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
909; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
910; GFX90A-TGSPLIT-NEXT:    s_endpgm
911    i32 addrspace(3)* %out, i32 %in) {
912entry:
  ; %val has no users in this function; only the atomic operation is checked.
913  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release
914  ret void
915}
916
; Test: workgroup-scope `acq_rel` atomicrmw xchg through an addrspace(3)
; pointer; the exchanged value is not used afterwards.
; Expected output, per the assertions below:
;  * before ds_wrxchg_rtn_b32, every run expects a wait. It is
;    `s_waitcnt lgkmcnt(0)` in most runs. gfx1010 without +cumode expects
;    `s_waitcnt vmcnt(0) lgkmcnt(0)` plus `s_waitcnt_vscnt null, 0x0`, and
;    gfx90a with +tgsplit expects `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;  * after the exchange, all runs except gfx90a with +tgsplit expect
;    `s_waitcnt lgkmcnt(0)`; gfx1010 without +cumode then also expects
;    `buffer_gl0_inv`; gfx90a with +tgsplit expects `buffer_wbinvl1_vol`
;    with no wait.
917define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
918; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw:
919; GFX6:       ; %bb.0: ; %entry
920; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
921; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
922; GFX6-NEXT:    s_mov_b32 m0, -1
923; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX6-NEXT:    v_mov_b32_e32 v0, s0
925; GFX6-NEXT:    v_mov_b32_e32 v1, s1
926; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
928; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX6-NEXT:    s_endpgm
930;
931; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw:
932; GFX7:       ; %bb.0: ; %entry
933; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
934; GFX7-NEXT:    s_mov_b32 m0, -1
935; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX7-NEXT:    v_mov_b32_e32 v0, s0
937; GFX7-NEXT:    v_mov_b32_e32 v1, s1
938; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
940; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX7-NEXT:    s_endpgm
942;
943; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw:
944; GFX10-WGP:       ; %bb.0: ; %entry
945; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
946; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
948; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
949; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
950; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
951; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
952; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX10-WGP-NEXT:    buffer_gl0_inv
954; GFX10-WGP-NEXT:    s_endpgm
955;
956; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
957; GFX10-CU:       ; %bb.0: ; %entry
958; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
959; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
961; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
962; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
963; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
964; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
965; GFX10-CU-NEXT:    s_endpgm
966;
967; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_atomicrmw:
968; SKIP-CACHE-INV:       ; %bb.0: ; %entry
969; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
970; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
971; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
972; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
973; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
974; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
975; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
976; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
977; SKIP-CACHE-INV-NEXT:    s_endpgm
978;
979; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
980; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
981; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
982; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
983; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
984; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
985; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
987; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
989;
990; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
991; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
992; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
993; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
995; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
996; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
997; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
998; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
999; GFX90A-TGSPLIT-NEXT:    s_endpgm
1000    i32 addrspace(3)* %out, i32 %in) {
1001entry:
  ; %val has no users in this function; only the atomic operation is checked.
1002  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
1003  ret void
1004}
1005
; Test: workgroup-scope `seq_cst` atomicrmw xchg through an addrspace(3)
; pointer; the exchanged value is not used afterwards.
; Expected output, per the assertions below:
;  * before ds_wrxchg_rtn_b32, every run expects a wait. It is
;    `s_waitcnt lgkmcnt(0)` in most runs. gfx1010 without +cumode expects
;    `s_waitcnt vmcnt(0) lgkmcnt(0)` plus `s_waitcnt_vscnt null, 0x0`, and
;    gfx90a with +tgsplit expects `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;  * after the exchange, all runs except gfx90a with +tgsplit expect
;    `s_waitcnt lgkmcnt(0)`; gfx1010 without +cumode then also expects
;    `buffer_gl0_inv`; gfx90a with +tgsplit expects `buffer_wbinvl1_vol`
;    with no wait.
1006define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
1007; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw:
1008; GFX6:       ; %bb.0: ; %entry
1009; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1010; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1011; GFX6-NEXT:    s_mov_b32 m0, -1
1012; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1014; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1015; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1016; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1017; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1018; GFX6-NEXT:    s_endpgm
1019;
1020; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw:
1021; GFX7:       ; %bb.0: ; %entry
1022; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1023; GFX7-NEXT:    s_mov_b32 m0, -1
1024; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1026; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1027; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1029; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX7-NEXT:    s_endpgm
1031;
1032; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw:
1033; GFX10-WGP:       ; %bb.0: ; %entry
1034; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1035; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1037; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1038; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1039; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1040; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1041; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1042; GFX10-WGP-NEXT:    buffer_gl0_inv
1043; GFX10-WGP-NEXT:    s_endpgm
1044;
1045; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
1046; GFX10-CU:       ; %bb.0: ; %entry
1047; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1048; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1050; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1051; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1053; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX10-CU-NEXT:    s_endpgm
1055;
1056; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_atomicrmw:
1057; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1058; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1059; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1060; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1061; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1062; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1063; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1064; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1065; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1066; SKIP-CACHE-INV-NEXT:    s_endpgm
1067;
1068; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
1069; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1070; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1071; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1072; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1073; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1074; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1076; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1078;
1079; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
1080; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1081; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1082; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1084; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1085; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1086; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1087; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1088; GFX90A-TGSPLIT-NEXT:    s_endpgm
1089    i32 addrspace(3)* %out, i32 %in) {
1090entry:
  ; %val has no users in this function; only the atomic operation is checked.
1091  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
1092  ret void
1093}
1094
; Test: workgroup-scope `acquire` atomicrmw xchg through an addrspace(3)
; pointer, where the value returned by the exchange is stored back through
; the same pointer.
; Expected output, per the assertions below: ds_wrxchg_rtn_b32 writes v1, and
; every run expects `s_waitcnt lgkmcnt(0)` before the following
; `ds_write_b32 v0, v1`. gfx1010 without +cumode also expects
; `buffer_gl0_inv` between that wait and the write; gfx90a with +tgsplit
; expects `buffer_wbinvl1_vol` between the exchange and the wait.
1095define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
1096; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw:
1097; GFX6:       ; %bb.0: ; %entry
1098; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1099; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1100; GFX6-NEXT:    s_mov_b32 m0, -1
1101; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1102; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1103; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1104; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1105; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX6-NEXT:    ds_write_b32 v0, v1
1107; GFX6-NEXT:    s_endpgm
1108;
1109; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw:
1110; GFX7:       ; %bb.0: ; %entry
1111; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1112; GFX7-NEXT:    s_mov_b32 m0, -1
1113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1115; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1116; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1117; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1118; GFX7-NEXT:    ds_write_b32 v0, v1
1119; GFX7-NEXT:    s_endpgm
1120;
1121; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw:
1122; GFX10-WGP:       ; %bb.0: ; %entry
1123; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1124; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1126; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1127; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1128; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX10-WGP-NEXT:    buffer_gl0_inv
1130; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1131; GFX10-WGP-NEXT:    s_endpgm
1132;
1133; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
1134; GFX10-CU:       ; %bb.0: ; %entry
1135; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1136; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1138; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1139; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1140; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1142; GFX10-CU-NEXT:    s_endpgm
1143;
1144; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_ret_atomicrmw:
1145; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1146; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1147; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1148; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1150; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1151; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1152; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1153; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1154; SKIP-CACHE-INV-NEXT:    s_endpgm
1155;
1156; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
1157; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1158; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1159; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1161; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1162; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1163; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1164; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1165; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1166;
1167; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
1168; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1169; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1170; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1171; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1172; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1173; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1174; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1175; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1176; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1177; GFX90A-TGSPLIT-NEXT:    s_endpgm
1178    i32 addrspace(3)* %out, i32 %in) {
1179entry:
1180  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
  ; Store the value returned by the exchange back through %out, so the
  ; returned value has a user.
1181  store i32 %val, i32 addrspace(3)* %out, align 4
1182  ret void
1183}
1184
; Test: workgroup-scope `acq_rel` atomicrmw xchg through an addrspace(3)
; pointer, where the value returned by the exchange is stored back through
; the same pointer.
; Expected output, per the assertions below:
;  * before ds_wrxchg_rtn_b32, every run expects a wait. It is
;    `s_waitcnt lgkmcnt(0)` in most runs. gfx1010 without +cumode expects
;    `s_waitcnt vmcnt(0) lgkmcnt(0)` plus `s_waitcnt_vscnt null, 0x0`, and
;    gfx90a with +tgsplit expects `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;  * after the exchange, every run expects `s_waitcnt lgkmcnt(0)` before
;    `ds_write_b32 v0, v1`. gfx1010 without +cumode also expects
;    `buffer_gl0_inv` between that wait and the write; gfx90a with +tgsplit
;    expects `buffer_wbinvl1_vol` between the exchange and the wait.
1185define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
1186; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1187; GFX6:       ; %bb.0: ; %entry
1188; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1189; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1190; GFX6-NEXT:    s_mov_b32 m0, -1
1191; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1192; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1193; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1194; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1196; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1197; GFX6-NEXT:    ds_write_b32 v0, v1
1198; GFX6-NEXT:    s_endpgm
1199;
1200; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1201; GFX7:       ; %bb.0: ; %entry
1202; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1203; GFX7-NEXT:    s_mov_b32 m0, -1
1204; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1205; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1206; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1207; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1208; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1209; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX7-NEXT:    ds_write_b32 v0, v1
1211; GFX7-NEXT:    s_endpgm
1212;
1213; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1214; GFX10-WGP:       ; %bb.0: ; %entry
1215; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1216; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1219; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1220; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1221; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1222; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1223; GFX10-WGP-NEXT:    buffer_gl0_inv
1224; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1225; GFX10-WGP-NEXT:    s_endpgm
1226;
1227; GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1228; GFX10-CU:       ; %bb.0: ; %entry
1229; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1233; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1234; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1235; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1237; GFX10-CU-NEXT:    s_endpgm
1238;
1239; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1240; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1241; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1242; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1246; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1247; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1248; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1249; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1250; SKIP-CACHE-INV-NEXT:    s_endpgm
1251;
1252; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1253; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1254; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1255; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1257; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1258; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1260; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1262; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1263;
1264; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
1265; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1266; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1267; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1268; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1269; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1270; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1271; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1272; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1273; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1274; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1275; GFX90A-TGSPLIT-NEXT:    s_endpgm
1276    i32 addrspace(3)* %out, i32 %in) {
1277entry:
1278  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
  ; Store the value returned by the exchange back through %out, so the
  ; returned value has a user.
1279  store i32 %val, i32 addrspace(3)* %out, align 4
1280  ret void
1281}
1282
; Test: workgroup-scope `seq_cst` atomicrmw xchg through an addrspace(3)
; pointer, where the value returned by the exchange is stored back through
; the same pointer.
; Expected output, per the assertions below:
;  * before ds_wrxchg_rtn_b32, every run expects a wait. It is
;    `s_waitcnt lgkmcnt(0)` in most runs. gfx1010 without +cumode expects
;    `s_waitcnt vmcnt(0) lgkmcnt(0)` plus `s_waitcnt_vscnt null, 0x0`, and
;    gfx90a with +tgsplit expects `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;  * after the exchange, every run expects `s_waitcnt lgkmcnt(0)` before
;    `ds_write_b32 v0, v1`. gfx1010 without +cumode also expects
;    `buffer_gl0_inv` between that wait and the write; gfx90a with +tgsplit
;    expects `buffer_wbinvl1_vol` between the exchange and the wait.
1283define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
1284; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1285; GFX6:       ; %bb.0: ; %entry
1286; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1287; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1288; GFX6-NEXT:    s_mov_b32 m0, -1
1289; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1290; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1291; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1292; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1294; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1295; GFX6-NEXT:    ds_write_b32 v0, v1
1296; GFX6-NEXT:    s_endpgm
1297;
1298; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1299; GFX7:       ; %bb.0: ; %entry
1300; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1301; GFX7-NEXT:    s_mov_b32 m0, -1
1302; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1304; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1305; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1306; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1307; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1308; GFX7-NEXT:    ds_write_b32 v0, v1
1309; GFX7-NEXT:    s_endpgm
1310;
1311; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1312; GFX10-WGP:       ; %bb.0: ; %entry
1313; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1314; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1316; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1317; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1318; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1319; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1320; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX10-WGP-NEXT:    buffer_gl0_inv
1322; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1323; GFX10-WGP-NEXT:    s_endpgm
1324;
1325; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1326; GFX10-CU:       ; %bb.0: ; %entry
1327; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1328; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1330; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1331; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1332; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1333; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1334; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1335; GFX10-CU-NEXT:    s_endpgm
1336;
1337; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1338; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1339; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1340; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1341; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1342; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1343; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1344; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1345; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1346; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1347; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1348; SKIP-CACHE-INV-NEXT:    s_endpgm
1349;
1350; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1351; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1352; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1353; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1354; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1355; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1356; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1358; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1359; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1360; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1361;
1362; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
1363; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1364; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1365; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1366; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1367; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1368; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1369; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1370; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1371; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1373; GFX90A-TGSPLIT-NEXT:    s_endpgm
1374    i32 addrspace(3)* %out, i32 %in) {
1375entry:
1376  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
  ; Store the value returned by the exchange back through %out, so the
  ; returned value has a user.
1377  store i32 %val, i32 addrspace(3)* %out, align 4
1378  ret void
1379}
1380
; Test: workgroup-scope cmpxchg with `monotonic` success and failure
; orderings on an addrspace(3) pointer offset by 4 i32 elements from %out;
; the cmpxchg result is not used afterwards.
; Expected output, per the assertions below: every run emits a single
; `ds_cmpst_b32 v0, v1, v2 offset:16` followed directly by s_endpgm. The only
; wait expected is the `s_waitcnt lgkmcnt(0)` after the s_load instructions,
; and no buffer_gl0_inv or buffer_wbinvl1_vol is expected in any run.
1381define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
1382; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1383; GFX6:       ; %bb.0: ; %entry
1384; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1385; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1386; GFX6-NEXT:    s_mov_b32 m0, -1
1387; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1389; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1390; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1391; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1392; GFX6-NEXT:    s_endpgm
1393;
1394; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1395; GFX7:       ; %bb.0: ; %entry
1396; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1397; GFX7-NEXT:    s_mov_b32 m0, -1
1398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1399; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1400; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1401; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1402; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1403; GFX7-NEXT:    s_endpgm
1404;
1405; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1406; GFX10-WGP:       ; %bb.0: ; %entry
1407; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1408; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1410; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1411; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1412; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1413; GFX10-WGP-NEXT:    s_endpgm
1414;
1415; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1416; GFX10-CU:       ; %bb.0: ; %entry
1417; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1418; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1419; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1420; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1421; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1422; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1423; GFX10-CU-NEXT:    s_endpgm
1424;
1425; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1426; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1427; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1428; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1429; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1430; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1432; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1433; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1434; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1435; SKIP-CACHE-INV-NEXT:    s_endpgm
1436;
1437; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1438; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1439; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1440; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1442; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1443; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1444; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1445; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1446;
1447; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
1448; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1449; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1450; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1452; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1453; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1454; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1455; GFX90A-TGSPLIT-NEXT:    s_endpgm
1456    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1457entry:
  ; Address is %out advanced by 4 i32 elements; the assertions above expect
  ; this as `offset:16` on ds_cmpst_b32 (presumably 4 elements x 4 bytes
  ; folded into the instruction offset).
1458  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and write %in on success; %val has no users.
1459  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
1460  ret void
1461}
1462
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering acquire and failure ordering
; monotonic. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * every prefix except GFX90A-TGSPLIT has "s_waitcnt lgkmcnt(0)" after it;
;   * GFX10-WGP additionally has buffer_gl0_inv after that wait;
;   * GFX90A-TGSPLIT has buffer_wbinvl1_vol after it, with no wait in between.
; No extra wait is expected directly before the ds_cmpst_b32.
1463define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
1464; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1465; GFX6:       ; %bb.0: ; %entry
1466; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1467; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1468; GFX6-NEXT:    s_mov_b32 m0, -1
1469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1470; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1471; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1472; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1473; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1474; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1475; GFX6-NEXT:    s_endpgm
1476;
1477; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1478; GFX7:       ; %bb.0: ; %entry
1479; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1480; GFX7-NEXT:    s_mov_b32 m0, -1
1481; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1483; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1484; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1485; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1486; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1487; GFX7-NEXT:    s_endpgm
1488;
1489; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1490; GFX10-WGP:       ; %bb.0: ; %entry
1491; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1492; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1493; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1494; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1495; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1496; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1497; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX10-WGP-NEXT:    buffer_gl0_inv
1499; GFX10-WGP-NEXT:    s_endpgm
1500;
1501; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1502; GFX10-CU:       ; %bb.0: ; %entry
1503; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1504; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1506; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1507; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1508; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1509; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1510; GFX10-CU-NEXT:    s_endpgm
1511;
1512; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1513; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1514; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1515; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1516; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1517; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1518; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1519; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1520; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1521; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1522; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1523; SKIP-CACHE-INV-NEXT:    s_endpgm
1524;
1525; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1526; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1527; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1528; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1529; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1530; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1531; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1532; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1533; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1534; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1535;
1536; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
1537; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1538; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1539; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1540; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1541; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1542; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1543; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1544; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1545; GFX90A-TGSPLIT-NEXT:    s_endpgm
1546    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1547entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
1548  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
1549  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
1550  ret void
1551}
1552
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering release and failure ordering
; monotonic. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * a second wait sits directly before it: "s_waitcnt lgkmcnt(0)" for most
;     prefixes, "s_waitcnt vmcnt(0) lgkmcnt(0)" for GFX90A-TGSPLIT, and
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0" for
;     GFX10-WGP;
;   * nothing is expected between the ds_cmpst_b32 and s_endpgm.
1553define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
1554; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg:
1555; GFX6:       ; %bb.0: ; %entry
1556; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1557; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1558; GFX6-NEXT:    s_mov_b32 m0, -1
1559; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1561; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1562; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1563; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1565; GFX6-NEXT:    s_endpgm
1566;
1567; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg:
1568; GFX7:       ; %bb.0: ; %entry
1569; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1570; GFX7-NEXT:    s_mov_b32 m0, -1
1571; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1573; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1574; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1575; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1577; GFX7-NEXT:    s_endpgm
1578;
1579; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
1580; GFX10-WGP:       ; %bb.0: ; %entry
1581; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1582; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1584; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1585; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1586; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1587; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1588; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1589; GFX10-WGP-NEXT:    s_endpgm
1590;
1591; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
1592; GFX10-CU:       ; %bb.0: ; %entry
1593; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1594; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1596; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1597; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1598; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1600; GFX10-CU-NEXT:    s_endpgm
1601;
1602; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_cmpxchg:
1603; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1604; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1605; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1606; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1607; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1608; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1609; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1610; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1611; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1612; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1613; SKIP-CACHE-INV-NEXT:    s_endpgm
1614;
1615; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
1616; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1617; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1618; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1620; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1621; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1622; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1623; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1624; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1625;
1626; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
1627; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1628; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1629; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1630; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1631; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1632; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1633; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1634; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1635; GFX90A-TGSPLIT-NEXT:    s_endpgm
1636    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1637entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
1638  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
1639  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
1640  ret void
1641}
1642
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering acq_rel and failure ordering
; monotonic. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * directly before it: "s_waitcnt lgkmcnt(0)" for most prefixes,
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" for GFX90A-TGSPLIT, and
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0" for
;     GFX10-WGP;
;   * directly after it: "s_waitcnt lgkmcnt(0)" for every prefix except
;     GFX90A-TGSPLIT, followed by buffer_gl0_inv for GFX10-WGP only;
;     GFX90A-TGSPLIT has buffer_wbinvl1_vol instead, with no wait before it.
1643define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
1644; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1645; GFX6:       ; %bb.0: ; %entry
1646; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1647; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1648; GFX6-NEXT:    s_mov_b32 m0, -1
1649; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1650; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1651; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1652; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1653; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1654; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1655; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1656; GFX6-NEXT:    s_endpgm
1657;
1658; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1659; GFX7:       ; %bb.0: ; %entry
1660; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1661; GFX7-NEXT:    s_mov_b32 m0, -1
1662; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1663; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1664; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1665; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1666; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1667; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1668; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX7-NEXT:    s_endpgm
1670;
1671; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1672; GFX10-WGP:       ; %bb.0: ; %entry
1673; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1674; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1675; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1676; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1677; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1678; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1679; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1680; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1681; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1682; GFX10-WGP-NEXT:    buffer_gl0_inv
1683; GFX10-WGP-NEXT:    s_endpgm
1684;
1685; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1686; GFX10-CU:       ; %bb.0: ; %entry
1687; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1688; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1690; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1691; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1692; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1694; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX10-CU-NEXT:    s_endpgm
1696;
1697; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1698; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1699; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1700; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1701; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1702; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1703; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1705; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1706; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1707; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1708; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1709; SKIP-CACHE-INV-NEXT:    s_endpgm
1710;
1711; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1712; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1713; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1714; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1715; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1716; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1717; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1718; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1720; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1721; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1722;
1723; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
1724; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1725; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1726; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1728; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1729; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1730; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1731; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1732; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1733; GFX90A-TGSPLIT-NEXT:    s_endpgm
1734    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1735entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
1736  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
1737  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
1738  ret void
1739}
1740
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering seq_cst and failure ordering
; monotonic. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * directly before it: "s_waitcnt lgkmcnt(0)" for most prefixes,
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" for GFX90A-TGSPLIT, and
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0" for
;     GFX10-WGP;
;   * directly after it: "s_waitcnt lgkmcnt(0)" for every prefix except
;     GFX90A-TGSPLIT, followed by buffer_gl0_inv for GFX10-WGP only;
;     GFX90A-TGSPLIT has buffer_wbinvl1_vol instead, with no wait before it.
1741define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
1742; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1743; GFX6:       ; %bb.0: ; %entry
1744; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1745; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1746; GFX6-NEXT:    s_mov_b32 m0, -1
1747; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1748; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1749; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1750; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1751; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1753; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1754; GFX6-NEXT:    s_endpgm
1755;
1756; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1757; GFX7:       ; %bb.0: ; %entry
1758; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1759; GFX7-NEXT:    s_mov_b32 m0, -1
1760; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1761; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1762; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1763; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1764; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1765; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1766; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1767; GFX7-NEXT:    s_endpgm
1768;
1769; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1770; GFX10-WGP:       ; %bb.0: ; %entry
1771; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1772; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1774; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1775; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1776; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1777; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1778; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1779; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1780; GFX10-WGP-NEXT:    buffer_gl0_inv
1781; GFX10-WGP-NEXT:    s_endpgm
1782;
1783; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1784; GFX10-CU:       ; %bb.0: ; %entry
1785; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1786; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1788; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1789; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1790; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1792; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX10-CU-NEXT:    s_endpgm
1794;
1795; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1796; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1797; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1798; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1799; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1800; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1804; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1805; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1806; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1807; SKIP-CACHE-INV-NEXT:    s_endpgm
1808;
1809; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1810; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1811; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1812; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1813; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1814; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1815; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1816; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1817; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1818; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1819; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1820;
1821; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
1822; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1823; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1824; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1826; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1827; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1828; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1829; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1830; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1831; GFX90A-TGSPLIT-NEXT:    s_endpgm
1832    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1833entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
1834  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
1835  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
1836  ret void
1837}
1838
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering acquire and failure ordering
; acquire. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * every prefix except GFX90A-TGSPLIT has "s_waitcnt lgkmcnt(0)" after it;
;   * GFX10-WGP additionally has buffer_gl0_inv after that wait;
;   * GFX90A-TGSPLIT has buffer_wbinvl1_vol after it, with no wait in between.
; No extra wait is expected directly before the ds_cmpst_b32.
1839define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
1840; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1841; GFX6:       ; %bb.0: ; %entry
1842; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1843; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1844; GFX6-NEXT:    s_mov_b32 m0, -1
1845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1846; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1847; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1848; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1849; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1850; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1851; GFX6-NEXT:    s_endpgm
1852;
1853; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1854; GFX7:       ; %bb.0: ; %entry
1855; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1856; GFX7-NEXT:    s_mov_b32 m0, -1
1857; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1858; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1859; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1860; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1861; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1862; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX7-NEXT:    s_endpgm
1864;
1865; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1866; GFX10-WGP:       ; %bb.0: ; %entry
1867; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1868; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1869; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1870; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1871; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1872; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1873; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX10-WGP-NEXT:    buffer_gl0_inv
1875; GFX10-WGP-NEXT:    s_endpgm
1876;
1877; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1878; GFX10-CU:       ; %bb.0: ; %entry
1879; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1880; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1881; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1882; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1883; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1884; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1885; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX10-CU-NEXT:    s_endpgm
1887;
1888; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1889; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1890; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1891; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1892; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1893; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1894; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1895; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1896; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1897; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1898; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1899; SKIP-CACHE-INV-NEXT:    s_endpgm
1900;
1901; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1902; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1903; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1904; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1906; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1907; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1908; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1909; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1911;
1912; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
1913; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1914; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1915; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1916; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1917; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1918; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1919; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1920; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1921; GFX90A-TGSPLIT-NEXT:    s_endpgm
1922    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1923entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
1924  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
1925  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
1926  ret void
1927}
1928
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering release and failure ordering
; acquire. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * directly before it: "s_waitcnt lgkmcnt(0)" for most prefixes,
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" for GFX90A-TGSPLIT, and
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0" for
;     GFX10-WGP;
;   * directly after it: "s_waitcnt lgkmcnt(0)" for every prefix except
;     GFX90A-TGSPLIT, followed by buffer_gl0_inv for GFX10-WGP only;
;     GFX90A-TGSPLIT has buffer_wbinvl1_vol instead, with no wait before it.
1929define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
1930; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg:
1931; GFX6:       ; %bb.0: ; %entry
1932; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1933; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1934; GFX6-NEXT:    s_mov_b32 m0, -1
1935; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1936; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1937; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1938; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1939; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1940; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1941; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1942; GFX6-NEXT:    s_endpgm
1943;
1944; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg:
1945; GFX7:       ; %bb.0: ; %entry
1946; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1947; GFX7-NEXT:    s_mov_b32 m0, -1
1948; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1950; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1951; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1952; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1953; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1954; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX7-NEXT:    s_endpgm
1956;
1957; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
1958; GFX10-WGP:       ; %bb.0: ; %entry
1959; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1960; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1961; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1962; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1963; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1964; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1965; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1966; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1967; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX10-WGP-NEXT:    buffer_gl0_inv
1969; GFX10-WGP-NEXT:    s_endpgm
1970;
1971; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
1972; GFX10-CU:       ; %bb.0: ; %entry
1973; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1974; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1975; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1976; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1977; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1978; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1980; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1981; GFX10-CU-NEXT:    s_endpgm
1982;
1983; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_cmpxchg:
1984; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1985; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1986; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1987; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1988; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1989; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1991; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1992; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1993; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1994; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1995; SKIP-CACHE-INV-NEXT:    s_endpgm
1996;
1997; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
1998; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1999; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2002; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2003; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2004; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2005; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2006; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2008;
2009; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
2010; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2011; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2012; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2013; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2014; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2015; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2016; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2017; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2018; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2019; GFX90A-TGSPLIT-NEXT:    s_endpgm
2020    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2021entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
2022  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
2023  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
2024  ret void
2025}
2026
; Checks the code generated for a volatile cmpxchg on an addrspace(3) pointer
; with syncscope("workgroup"), success ordering acq_rel and failure ordering
; acquire. The cmpxchg result is not used.
; Expected synchronization around the ds_cmpst_b32, per check prefix:
;   * directly before it: "s_waitcnt lgkmcnt(0)" for most prefixes,
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" for GFX90A-TGSPLIT, and
;     "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0" for
;     GFX10-WGP;
;   * directly after it: "s_waitcnt lgkmcnt(0)" for every prefix except
;     GFX90A-TGSPLIT, followed by buffer_gl0_inv for GFX10-WGP only;
;     GFX90A-TGSPLIT has buffer_wbinvl1_vol instead, with no wait before it.
2027define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
2028; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2029; GFX6:       ; %bb.0: ; %entry
2030; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2031; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2032; GFX6-NEXT:    s_mov_b32 m0, -1
2033; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2034; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2035; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2036; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2037; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2038; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2039; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2040; GFX6-NEXT:    s_endpgm
2041;
2042; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2043; GFX7:       ; %bb.0: ; %entry
2044; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2045; GFX7-NEXT:    s_mov_b32 m0, -1
2046; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2047; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2048; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2049; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2050; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2051; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2052; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX7-NEXT:    s_endpgm
2054;
2055; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2056; GFX10-WGP:       ; %bb.0: ; %entry
2057; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2058; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2059; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2060; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2061; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2062; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2063; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2064; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2065; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2066; GFX10-WGP-NEXT:    buffer_gl0_inv
2067; GFX10-WGP-NEXT:    s_endpgm
2068;
2069; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2070; GFX10-CU:       ; %bb.0: ; %entry
2071; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2072; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2074; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2075; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2076; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2077; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2078; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX10-CU-NEXT:    s_endpgm
2080;
2081; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2082; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2083; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2084; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2085; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2086; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2088; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2090; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2091; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2092; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2093; SKIP-CACHE-INV-NEXT:    s_endpgm
2094;
2095; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2096; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2097; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2098; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2099; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2100; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2101; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2102; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2104; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2105; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2106;
2107; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
2108; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2109; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2110; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2111; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2112; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2113; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2114; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2115; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2116; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2117; GFX90A-TGSPLIT-NEXT:    s_endpgm
2118    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2119entry:
  ; i32 element 4 of %out, i.e. byte offset 16 (the "offset:16" operand in the checks).
2120  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the compare value and %in the replacement value; %val is unused.
2121  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
2122  ret void
2123}
2124
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with success ordering
; seq_cst and failure ordering acquire; the result of the cmpxchg is unused.
; The checks below expect a non-returning ds_cmpst_b32 at offset:16 (element
; index 4 of i32). The GFX10-WGP run additionally expects
; "s_waitcnt vmcnt(0) lgkmcnt(0)" plus s_waitcnt_vscnt before the DS op and
; buffer_gl0_inv after it; the GFX90A-TGSPLIT run expects
; "s_waitcnt vmcnt(0) lgkmcnt(0)" before and buffer_wbinvl1_vol after. All
; other runs only expect "s_waitcnt lgkmcnt(0)" on both sides of the DS op.
2125define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
2126; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2127; GFX6:       ; %bb.0: ; %entry
2128; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2129; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2130; GFX6-NEXT:    s_mov_b32 m0, -1
2131; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2133; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2134; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2135; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2137; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX6-NEXT:    s_endpgm
2139;
2140; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2141; GFX7:       ; %bb.0: ; %entry
2142; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2143; GFX7-NEXT:    s_mov_b32 m0, -1
2144; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2145; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2146; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2147; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2148; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2149; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2150; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2151; GFX7-NEXT:    s_endpgm
2152;
2153; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2154; GFX10-WGP:       ; %bb.0: ; %entry
2155; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2156; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2157; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2158; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2159; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2160; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2161; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2162; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2163; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2164; GFX10-WGP-NEXT:    buffer_gl0_inv
2165; GFX10-WGP-NEXT:    s_endpgm
2166;
2167; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2168; GFX10-CU:       ; %bb.0: ; %entry
2169; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2170; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2171; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2172; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2173; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2174; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2175; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2176; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2177; GFX10-CU-NEXT:    s_endpgm
2178;
2179; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2180; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2181; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2182; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2183; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2184; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2185; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2186; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2187; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2188; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2189; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2190; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2191; SKIP-CACHE-INV-NEXT:    s_endpgm
2192;
2193; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2194; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2195; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2196; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2197; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2198; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2199; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2200; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2201; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2204;
2205; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
2206; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2207; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2208; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2209; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2210; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2211; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2212; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2213; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2214; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2215; GFX90A-TGSPLIT-NEXT:    s_endpgm
2216    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2217entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2218  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %val has no users, so the checks show the non-returning ds_cmpst_b32.
2219  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
2220  ret void
2221}
2222
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with seq_cst as both the
; success and the failure ordering; the result of the cmpxchg is unused.
; The checks below expect a non-returning ds_cmpst_b32 at offset:16 (element
; index 4 of i32). The GFX10-WGP run additionally expects
; "s_waitcnt vmcnt(0) lgkmcnt(0)" plus s_waitcnt_vscnt before the DS op and
; buffer_gl0_inv after it; the GFX90A-TGSPLIT run expects
; "s_waitcnt vmcnt(0) lgkmcnt(0)" before and buffer_wbinvl1_vol after. All
; other runs only expect "s_waitcnt lgkmcnt(0)" on both sides of the DS op.
2223define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
2224; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2225; GFX6:       ; %bb.0: ; %entry
2226; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2227; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2228; GFX6-NEXT:    s_mov_b32 m0, -1
2229; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2230; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2231; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2232; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2233; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2235; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2236; GFX6-NEXT:    s_endpgm
2237;
2238; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2239; GFX7:       ; %bb.0: ; %entry
2240; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2241; GFX7-NEXT:    s_mov_b32 m0, -1
2242; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2244; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2245; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2246; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2247; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2248; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2249; GFX7-NEXT:    s_endpgm
2250;
2251; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2252; GFX10-WGP:       ; %bb.0: ; %entry
2253; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2254; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2255; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2256; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2257; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2258; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2259; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2260; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2261; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2262; GFX10-WGP-NEXT:    buffer_gl0_inv
2263; GFX10-WGP-NEXT:    s_endpgm
2264;
2265; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2266; GFX10-CU:       ; %bb.0: ; %entry
2267; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2268; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2269; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2270; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2271; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2272; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2273; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2274; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2275; GFX10-CU-NEXT:    s_endpgm
2276;
2277; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2278; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2279; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2280; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2281; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2282; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2283; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2285; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2286; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2287; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2288; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2289; SKIP-CACHE-INV-NEXT:    s_endpgm
2290;
2291; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2292; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2293; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2294; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2295; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2296; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2297; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2298; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2300; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2301; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2302;
2303; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
2304; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2305; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2306; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2308; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2309; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2310; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2311; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2312; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2313; GFX90A-TGSPLIT-NEXT:    s_endpgm
2314    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2315entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2316  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %val has no users, so the checks show the non-returning ds_cmpst_b32.
2317  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
2318  ret void
2319}
2320
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with success ordering
; acquire and failure ordering monotonic; field 0 of the cmpxchg result is
; stored back to %out.
; The checks below expect the returning form ds_cmpst_rtn_b32 at offset:16
; (element index 4 of i32) followed by ds_write_b32. No s_waitcnt is expected
; between the operand moves and the atomic in any run. After the atomic the
; GFX10-WGP run expects buffer_gl0_inv and the GFX90A-TGSPLIT run expects
; buffer_wbinvl1_vol; every run waits on lgkmcnt(0) before the ds_write_b32.
2321define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
2322; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2323; GFX6:       ; %bb.0: ; %entry
2324; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2325; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2326; GFX6-NEXT:    s_mov_b32 m0, -1
2327; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2328; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2329; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2330; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2331; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2332; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2333; GFX6-NEXT:    ds_write_b32 v0, v1
2334; GFX6-NEXT:    s_endpgm
2335;
2336; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2337; GFX7:       ; %bb.0: ; %entry
2338; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2339; GFX7-NEXT:    s_mov_b32 m0, -1
2340; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2341; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2342; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2343; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2344; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2345; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX7-NEXT:    ds_write_b32 v0, v1
2347; GFX7-NEXT:    s_endpgm
2348;
2349; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2350; GFX10-WGP:       ; %bb.0: ; %entry
2351; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2352; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2353; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2354; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2355; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2356; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2357; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX10-WGP-NEXT:    buffer_gl0_inv
2359; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2360; GFX10-WGP-NEXT:    s_endpgm
2361;
2362; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2363; GFX10-CU:       ; %bb.0: ; %entry
2364; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2365; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2366; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2367; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2368; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2369; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2370; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2372; GFX10-CU-NEXT:    s_endpgm
2373;
2374; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2375; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2376; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2377; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2378; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2379; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2380; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2381; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2382; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2383; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2384; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2385; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2386; SKIP-CACHE-INV-NEXT:    s_endpgm
2387;
2388; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2389; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2390; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2391; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2392; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2393; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2394; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2395; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2396; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2397; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2398; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2399;
2400; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
2401; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2402; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2403; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2404; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2405; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2406; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2407; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2408; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2409; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2410; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2411; GFX90A-TGSPLIT-NEXT:    s_endpgm
2412    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2413entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2414  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2415  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
  ; Field 0 of the { i32, i1 } result is used, so the checks show the
  ; returning ds_cmpst_rtn_b32; the store below is the trailing ds_write_b32.
2416  %val0 = extractvalue { i32, i1 } %val, 0
2417  store i32 %val0, i32 addrspace(3)* %out, align 4
2418  ret void
2419}
2420
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with success ordering
; acq_rel and failure ordering monotonic; field 0 of the cmpxchg result is
; stored back to %out.
; The checks below expect the returning form ds_cmpst_rtn_b32 at offset:16
; (element index 4 of i32) followed by ds_write_b32. Before the atomic, the
; GFX10-WGP run expects "s_waitcnt vmcnt(0) lgkmcnt(0)" plus s_waitcnt_vscnt,
; the GFX90A-TGSPLIT run expects "s_waitcnt vmcnt(0) lgkmcnt(0)", and all
; other runs expect "s_waitcnt lgkmcnt(0)". After the atomic, GFX10-WGP
; expects buffer_gl0_inv and GFX90A-TGSPLIT expects buffer_wbinvl1_vol; every
; run waits on lgkmcnt(0) before the ds_write_b32.
2421define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
2422; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2423; GFX6:       ; %bb.0: ; %entry
2424; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2425; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2426; GFX6-NEXT:    s_mov_b32 m0, -1
2427; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2428; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2429; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2430; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2431; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2432; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2433; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2434; GFX6-NEXT:    ds_write_b32 v0, v1
2435; GFX6-NEXT:    s_endpgm
2436;
2437; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2438; GFX7:       ; %bb.0: ; %entry
2439; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2440; GFX7-NEXT:    s_mov_b32 m0, -1
2441; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2442; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2443; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2444; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2445; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2446; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2447; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2448; GFX7-NEXT:    ds_write_b32 v0, v1
2449; GFX7-NEXT:    s_endpgm
2450;
2451; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2452; GFX10-WGP:       ; %bb.0: ; %entry
2453; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2454; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2455; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2456; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2457; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2458; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2459; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2460; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2461; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2462; GFX10-WGP-NEXT:    buffer_gl0_inv
2463; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2464; GFX10-WGP-NEXT:    s_endpgm
2465;
2466; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2467; GFX10-CU:       ; %bb.0: ; %entry
2468; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2469; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2470; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2471; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2472; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2473; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2474; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2475; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2476; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2477; GFX10-CU-NEXT:    s_endpgm
2478;
2479; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2480; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2481; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2482; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2483; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2484; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2487; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2488; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2489; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2490; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2491; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2492; SKIP-CACHE-INV-NEXT:    s_endpgm
2493;
2494; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2495; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2496; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2497; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2499; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2500; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2501; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2502; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2503; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2504; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2505; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2506;
2507; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
2508; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2509; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2510; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2512; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2513; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2514; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2515; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2516; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2517; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2518; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2519; GFX90A-TGSPLIT-NEXT:    s_endpgm
2520    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2521entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2522  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2523  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is used, so the checks show the
  ; returning ds_cmpst_rtn_b32; the store below is the trailing ds_write_b32.
2524  %val0 = extractvalue { i32, i1 } %val, 0
2525  store i32 %val0, i32 addrspace(3)* %out, align 4
2526  ret void
2527}
2528
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with success ordering
; seq_cst and failure ordering monotonic; field 0 of the cmpxchg result is
; stored back to %out.
; The checks below expect the returning form ds_cmpst_rtn_b32 at offset:16
; (element index 4 of i32) followed by ds_write_b32. Before the atomic, the
; GFX10-WGP run expects "s_waitcnt vmcnt(0) lgkmcnt(0)" plus s_waitcnt_vscnt,
; the GFX90A-TGSPLIT run expects "s_waitcnt vmcnt(0) lgkmcnt(0)", and all
; other runs expect "s_waitcnt lgkmcnt(0)". After the atomic, GFX10-WGP
; expects buffer_gl0_inv and GFX90A-TGSPLIT expects buffer_wbinvl1_vol; every
; run waits on lgkmcnt(0) before the ds_write_b32.
2529define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
2530; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2531; GFX6:       ; %bb.0: ; %entry
2532; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2533; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2534; GFX6-NEXT:    s_mov_b32 m0, -1
2535; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2536; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2537; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2538; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2539; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2540; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2541; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX6-NEXT:    ds_write_b32 v0, v1
2543; GFX6-NEXT:    s_endpgm
2544;
2545; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2546; GFX7:       ; %bb.0: ; %entry
2547; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2548; GFX7-NEXT:    s_mov_b32 m0, -1
2549; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2550; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2551; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2552; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2553; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2554; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2555; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2556; GFX7-NEXT:    ds_write_b32 v0, v1
2557; GFX7-NEXT:    s_endpgm
2558;
2559; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2560; GFX10-WGP:       ; %bb.0: ; %entry
2561; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2562; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2563; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2564; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2565; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2566; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2567; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2568; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2569; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2570; GFX10-WGP-NEXT:    buffer_gl0_inv
2571; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2572; GFX10-WGP-NEXT:    s_endpgm
2573;
2574; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2575; GFX10-CU:       ; %bb.0: ; %entry
2576; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2577; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2578; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2579; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2580; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2581; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2583; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2584; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2585; GFX10-CU-NEXT:    s_endpgm
2586;
2587; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2588; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2589; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2590; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2591; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2592; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2593; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2595; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2596; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2597; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2598; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2599; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2600; SKIP-CACHE-INV-NEXT:    s_endpgm
2601;
2602; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2603; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2604; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2605; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2606; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2607; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2608; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2609; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2610; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2611; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2612; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2613; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2614;
2615; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
2616; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2617; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2618; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2619; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2620; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2621; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2622; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2623; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2624; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2625; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2626; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2627; GFX90A-TGSPLIT-NEXT:    s_endpgm
2628    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2629entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2630  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2631  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is used, so the checks show the
  ; returning ds_cmpst_rtn_b32; the store below is the trailing ds_write_b32.
2632  %val0 = extractvalue { i32, i1 } %val, 0
2633  store i32 %val0, i32 addrspace(3)* %out, align 4
2634  ret void
2635}
2636
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with acquire as both the
; success and the failure ordering; field 0 of the cmpxchg result is stored
; back to %out.
; The checks below expect the returning form ds_cmpst_rtn_b32 at offset:16
; (element index 4 of i32) followed by ds_write_b32. No s_waitcnt is expected
; between the operand moves and the atomic in any run. After the atomic the
; GFX10-WGP run expects buffer_gl0_inv and the GFX90A-TGSPLIT run expects
; buffer_wbinvl1_vol; every run waits on lgkmcnt(0) before the ds_write_b32.
2637define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
2638; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2639; GFX6:       ; %bb.0: ; %entry
2640; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2641; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2642; GFX6-NEXT:    s_mov_b32 m0, -1
2643; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2644; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2645; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2646; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2647; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2648; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX6-NEXT:    ds_write_b32 v0, v1
2650; GFX6-NEXT:    s_endpgm
2651;
2652; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2653; GFX7:       ; %bb.0: ; %entry
2654; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2655; GFX7-NEXT:    s_mov_b32 m0, -1
2656; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2657; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2658; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2659; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2660; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2661; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2662; GFX7-NEXT:    ds_write_b32 v0, v1
2663; GFX7-NEXT:    s_endpgm
2664;
2665; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2666; GFX10-WGP:       ; %bb.0: ; %entry
2667; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2668; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2669; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2670; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2671; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2672; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2673; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX10-WGP-NEXT:    buffer_gl0_inv
2675; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2676; GFX10-WGP-NEXT:    s_endpgm
2677;
2678; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2679; GFX10-CU:       ; %bb.0: ; %entry
2680; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2681; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2682; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2683; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2684; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2685; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2686; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2687; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2688; GFX10-CU-NEXT:    s_endpgm
2689;
2690; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2691; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2692; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2693; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2694; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2695; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2697; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2698; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2699; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2700; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2701; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2702; SKIP-CACHE-INV-NEXT:    s_endpgm
2703;
2704; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2705; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2706; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2707; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2708; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2709; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2710; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2711; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2712; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2713; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2714; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2715;
2716; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
2717; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2718; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2719; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2720; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2721; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2722; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2723; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2724; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2725; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2727; GFX90A-TGSPLIT-NEXT:    s_endpgm
2728    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2729entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2730  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2731  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
  ; Field 0 of the { i32, i1 } result is used, so the checks show the
  ; returning ds_cmpst_rtn_b32; the store below is the trailing ds_write_b32.
2732  %val0 = extractvalue { i32, i1 } %val, 0
2733  store i32 %val0, i32 addrspace(3)* %out, align 4
2734  ret void
2735}
2736
; Workgroup-scope cmpxchg on an i32 in addrspace(3) with success ordering
; release and failure ordering acquire; field 0 of the cmpxchg result is
; stored back to %out.
; The checks below expect the returning form ds_cmpst_rtn_b32 at offset:16
; (element index 4 of i32) followed by ds_write_b32. Before the atomic, the
; GFX10-WGP run expects "s_waitcnt vmcnt(0) lgkmcnt(0)" plus s_waitcnt_vscnt,
; the GFX90A-TGSPLIT run expects "s_waitcnt vmcnt(0) lgkmcnt(0)", and all
; other runs expect "s_waitcnt lgkmcnt(0)". After the atomic, GFX10-WGP
; expects buffer_gl0_inv and GFX90A-TGSPLIT expects buffer_wbinvl1_vol; every
; run waits on lgkmcnt(0) before the ds_write_b32.
2737define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
2738; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2739; GFX6:       ; %bb.0: ; %entry
2740; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2741; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2742; GFX6-NEXT:    s_mov_b32 m0, -1
2743; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2744; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2745; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2746; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2747; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2748; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2749; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX6-NEXT:    ds_write_b32 v0, v1
2751; GFX6-NEXT:    s_endpgm
2752;
2753; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2754; GFX7:       ; %bb.0: ; %entry
2755; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2756; GFX7-NEXT:    s_mov_b32 m0, -1
2757; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2758; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2759; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2760; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2761; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2762; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2763; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2764; GFX7-NEXT:    ds_write_b32 v0, v1
2765; GFX7-NEXT:    s_endpgm
2766;
2767; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2768; GFX10-WGP:       ; %bb.0: ; %entry
2769; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2770; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2771; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2772; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2773; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2774; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2775; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2776; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2777; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2778; GFX10-WGP-NEXT:    buffer_gl0_inv
2779; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2780; GFX10-WGP-NEXT:    s_endpgm
2781;
2782; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2783; GFX10-CU:       ; %bb.0: ; %entry
2784; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2785; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2786; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2787; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2788; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2789; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2790; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2791; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2792; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2793; GFX10-CU-NEXT:    s_endpgm
2794;
2795; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2796; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2797; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2798; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2799; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2800; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2804; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2805; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2806; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2807; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2808; SKIP-CACHE-INV-NEXT:    s_endpgm
2809;
2810; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2811; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2812; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2813; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2814; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2815; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2816; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2817; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2818; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2819; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2820; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2821; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2822;
2823; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
2824; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2825; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2826; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2828; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2829; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2830; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2831; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2832; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2833; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2834; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2835; GFX90A-TGSPLIT-NEXT:    s_endpgm
2836    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2837entry:
  ; Address element 4 of %out; this is the offset:16 seen in the checks.
2838  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2839  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
  ; Field 0 of the { i32, i1 } result is used, so the checks show the
  ; returning ds_cmpst_rtn_b32; the store below is the trailing ds_write_b32.
2840  %val0 = extractvalue { i32, i1 } %val, 0
2841  store i32 %val0, i32 addrspace(3)* %out, align 4
2842  ret void
2843}
2844
; Test: volatile cmpxchg on a local (addrspace(3)) i32 at syncscope
; "workgroup", success ordering acq_rel, failure ordering acquire. The value
; loaded by the cmpxchg is stored back through %out.
;
; What the autogenerated check lines in this function pin down:
;  - every run selects ds_cmpst_rtn_b32 with offset:16, with an s_waitcnt that
;    includes lgkmcnt(0) ahead of it and another before the final ds_write_b32;
;  - only the gfx1010 run without +cumode also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" and "s_waitcnt_vscnt null, 0x0" ahead of
;    the atomic and a buffer_gl0_inv after it;
;  - only the gfx90a run with +tgsplit also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" ahead of the atomic and a
;    buffer_wbinvl1_vol after it.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2845define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
2846; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2847; GFX6:       ; %bb.0: ; %entry
2848; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2849; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2850; GFX6-NEXT:    s_mov_b32 m0, -1
2851; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2852; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2853; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2854; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2855; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2856; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2857; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2858; GFX6-NEXT:    ds_write_b32 v0, v1
2859; GFX6-NEXT:    s_endpgm
2860;
2861; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2862; GFX7:       ; %bb.0: ; %entry
2863; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2864; GFX7-NEXT:    s_mov_b32 m0, -1
2865; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2867; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2868; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2869; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2870; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2871; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2872; GFX7-NEXT:    ds_write_b32 v0, v1
2873; GFX7-NEXT:    s_endpgm
2874;
2875; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2876; GFX10-WGP:       ; %bb.0: ; %entry
2877; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2878; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2879; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2880; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2881; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2882; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2883; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2884; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2885; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2886; GFX10-WGP-NEXT:    buffer_gl0_inv
2887; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2888; GFX10-WGP-NEXT:    s_endpgm
2889;
2890; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2891; GFX10-CU:       ; %bb.0: ; %entry
2892; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2893; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2895; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2896; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2897; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2898; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2899; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2900; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2901; GFX10-CU-NEXT:    s_endpgm
2902;
2903; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2904; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2905; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2906; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2907; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2908; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2909; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2910; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2911; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2912; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2913; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2914; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2915; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2916; SKIP-CACHE-INV-NEXT:    s_endpgm
2917;
2918; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2919; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2920; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2921; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2922; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2923; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2924; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2925; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2926; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2927; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2928; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2929; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2930;
2931; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
2932; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2933; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2934; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2935; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2936; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2937; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2938; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2939; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2940; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2941; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2943; GFX90A-TGSPLIT-NEXT:    s_endpgm
2944    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2945entry:
  ; Element 4 of an i32 pointer is byte offset 16, which shows up as the
  ; offset:16 operand on the expected ds_cmpst_rtn_b32.
2946  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
2947  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is unused here.
2948  %val0 = extractvalue { i32, i1 } %val, 0
2949  store i32 %val0, i32 addrspace(3)* %out, align 4
2950  ret void
2951}
2952
; Test: volatile cmpxchg on a local (addrspace(3)) i32 at syncscope
; "workgroup", success ordering seq_cst, failure ordering acquire. The value
; loaded by the cmpxchg is stored back through %out.
;
; What the autogenerated check lines in this function pin down:
;  - every run selects ds_cmpst_rtn_b32 with offset:16, with an s_waitcnt that
;    includes lgkmcnt(0) ahead of it and another before the final ds_write_b32;
;  - only the gfx1010 run without +cumode also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" and "s_waitcnt_vscnt null, 0x0" ahead of
;    the atomic and a buffer_gl0_inv after it;
;  - only the gfx90a run with +tgsplit also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" ahead of the atomic and a
;    buffer_wbinvl1_vol after it.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2953define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
2954; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
2955; GFX6:       ; %bb.0: ; %entry
2956; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2957; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2958; GFX6-NEXT:    s_mov_b32 m0, -1
2959; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2960; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2961; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2962; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2963; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2964; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2965; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2966; GFX6-NEXT:    ds_write_b32 v0, v1
2967; GFX6-NEXT:    s_endpgm
2968;
2969; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
2970; GFX7:       ; %bb.0: ; %entry
2971; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2972; GFX7-NEXT:    s_mov_b32 m0, -1
2973; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2974; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2975; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2976; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2977; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2978; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2979; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX7-NEXT:    ds_write_b32 v0, v1
2981; GFX7-NEXT:    s_endpgm
2982;
2983; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
2984; GFX10-WGP:       ; %bb.0: ; %entry
2985; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2986; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2988; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2989; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2990; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2991; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2992; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2993; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2994; GFX10-WGP-NEXT:    buffer_gl0_inv
2995; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2996; GFX10-WGP-NEXT:    s_endpgm
2997;
2998; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
2999; GFX10-CU:       ; %bb.0: ; %entry
3000; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3001; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3002; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3003; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3004; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3005; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3006; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3007; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3008; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3009; GFX10-CU-NEXT:    s_endpgm
3010;
3011; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
3012; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3013; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3014; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3015; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3016; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3017; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3018; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3019; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3020; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3021; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3022; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3023; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3024; SKIP-CACHE-INV-NEXT:    s_endpgm
3025;
3026; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
3027; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3028; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3029; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3030; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3031; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3032; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
3033; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3035; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3036; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3037; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3038;
3039; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
3040; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3041; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3042; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3043; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3044; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3045; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
3046; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3047; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3048; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3049; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3051; GFX90A-TGSPLIT-NEXT:    s_endpgm
3052    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3053entry:
  ; Element 4 of an i32 pointer is byte offset 16, which shows up as the
  ; offset:16 operand on the expected ds_cmpst_rtn_b32.
3054  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3055  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is unused here.
3056  %val0 = extractvalue { i32, i1 } %val, 0
3057  store i32 %val0, i32 addrspace(3)* %out, align 4
3058  ret void
3059}
3060
; Test: volatile cmpxchg on a local (addrspace(3)) i32 at syncscope
; "workgroup", with seq_cst for both the success and the failure ordering. The
; value loaded by the cmpxchg is stored back through %out.
;
; What the autogenerated check lines in this function pin down:
;  - every run selects ds_cmpst_rtn_b32 with offset:16, with an s_waitcnt that
;    includes lgkmcnt(0) ahead of it and another before the final ds_write_b32;
;  - only the gfx1010 run without +cumode also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" and "s_waitcnt_vscnt null, 0x0" ahead of
;    the atomic and a buffer_gl0_inv after it;
;  - only the gfx90a run with +tgsplit also expects
;    "s_waitcnt vmcnt(0) lgkmcnt(0)" ahead of the atomic and a
;    buffer_wbinvl1_vol after it.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3061define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
3062; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3063; GFX6:       ; %bb.0: ; %entry
3064; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
3065; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
3066; GFX6-NEXT:    s_mov_b32 m0, -1
3067; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3068; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3069; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3070; GFX6-NEXT:    v_mov_b32_e32 v2, s0
3071; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3072; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3073; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3074; GFX6-NEXT:    ds_write_b32 v0, v1
3075; GFX6-NEXT:    s_endpgm
3076;
3077; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3078; GFX7:       ; %bb.0: ; %entry
3079; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3080; GFX7-NEXT:    s_mov_b32 m0, -1
3081; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3082; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3083; GFX7-NEXT:    v_mov_b32_e32 v1, s2
3084; GFX7-NEXT:    v_mov_b32_e32 v2, s1
3085; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3086; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3087; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3088; GFX7-NEXT:    ds_write_b32 v0, v1
3089; GFX7-NEXT:    s_endpgm
3090;
3091; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3092; GFX10-WGP:       ; %bb.0: ; %entry
3093; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3094; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3095; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3096; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3097; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
3098; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3099; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3100; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3101; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3102; GFX10-WGP-NEXT:    buffer_gl0_inv
3103; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3104; GFX10-WGP-NEXT:    s_endpgm
3105;
3106; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3107; GFX10-CU:       ; %bb.0: ; %entry
3108; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3109; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3111; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3112; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
3113; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3114; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3115; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3116; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3117; GFX10-CU-NEXT:    s_endpgm
3118;
3119; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3120; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3121; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3122; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3123; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3124; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3125; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3126; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3127; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3128; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3129; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3130; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3131; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3132; SKIP-CACHE-INV-NEXT:    s_endpgm
3133;
3134; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3135; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3136; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3137; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3138; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3139; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3140; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
3141; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3143; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3145; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3146;
3147; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
3148; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3149; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3150; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3151; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3152; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3153; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
3154; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3155; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
3156; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3157; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3158; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3159; GFX90A-TGSPLIT-NEXT:    s_endpgm
3160    i32 addrspace(3)* %out, i32 %in, i32 %old) {
3161entry:
  ; Element 4 of an i32 pointer is byte offset 16, which shows up as the
  ; offset:16 operand on the expected ds_cmpst_rtn_b32.
3162  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
3163  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is unused here.
3164  %val0 = extractvalue { i32, i1 } %val, 0
3165  store i32 %val0, i32 addrspace(3)* %out, align 4
3166  ret void
3167}
3168
; Test: atomic i32 load from local memory (addrspace(3)) at syncscope
; "workgroup-one-as" with unordered ordering; the loaded value is then written
; to %out with an ordinary (non-atomic) store.
;
; The autogenerated check lines expect the same shape in every run: a
; ds_read_b32, an "s_waitcnt lgkmcnt(0)" so the loaded value is available, and
; a ds_write_b32. No cache-invalidate instruction and no vmcnt/vscnt wait is
; expected for any target. Runs differ only in how the kernel arguments are
; loaded and in where the second v_mov_b32 is placed.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3169define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
3170; GFX6-LABEL: local_workgroup_one_as_unordered_load:
3171; GFX6:       ; %bb.0: ; %entry
3172; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3173; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3174; GFX6-NEXT:    s_mov_b32 m0, -1
3175; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3176; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3177; GFX6-NEXT:    ds_read_b32 v0, v0
3178; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3179; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3180; GFX6-NEXT:    ds_write_b32 v1, v0
3181; GFX6-NEXT:    s_endpgm
3182;
3183; GFX7-LABEL: local_workgroup_one_as_unordered_load:
3184; GFX7:       ; %bb.0: ; %entry
3185; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3186; GFX7-NEXT:    s_mov_b32 m0, -1
3187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3188; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3189; GFX7-NEXT:    ds_read_b32 v0, v0
3190; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3191; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3192; GFX7-NEXT:    ds_write_b32 v1, v0
3193; GFX7-NEXT:    s_endpgm
3194;
3195; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load:
3196; GFX10-WGP:       ; %bb.0: ; %entry
3197; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3198; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3199; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3200; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3201; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3202; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3203; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3204; GFX10-WGP-NEXT:    s_endpgm
3205;
3206; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load:
3207; GFX10-CU:       ; %bb.0: ; %entry
3208; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3209; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3211; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3212; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3213; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3214; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3215; GFX10-CU-NEXT:    s_endpgm
3216;
3217; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load:
3218; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3219; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3220; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3221; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3223; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3225; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3226; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3227; SKIP-CACHE-INV-NEXT:    s_endpgm
3228;
3229; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
3230; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3231; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3232; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3234; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3235; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3236; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3237; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3238; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3239;
3240; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
3241; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3242; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3243; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3245; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3246; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3247; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3248; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3249; GFX90A-TGSPLIT-NEXT:    s_endpgm
3250    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3251entry:
  ; The atomic load is the operation under test; the store only keeps the
  ; loaded value live so the load is not dropped.
3252  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4
3253  store i32 %val, i32 addrspace(3)* %out
3254  ret void
3255}
3256
; Test: atomic i32 load from local memory (addrspace(3)) at syncscope
; "workgroup-one-as" with monotonic ordering; the loaded value is then written
; to %out with an ordinary (non-atomic) store.
;
; The autogenerated check lines expect the same shape in every run: a
; ds_read_b32, an "s_waitcnt lgkmcnt(0)" so the loaded value is available, and
; a ds_write_b32. No cache-invalidate instruction and no vmcnt/vscnt wait is
; expected for any target. Runs differ only in how the kernel arguments are
; loaded and in where the second v_mov_b32 is placed.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3257define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
3258; GFX6-LABEL: local_workgroup_one_as_monotonic_load:
3259; GFX6:       ; %bb.0: ; %entry
3260; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3261; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3262; GFX6-NEXT:    s_mov_b32 m0, -1
3263; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3264; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3265; GFX6-NEXT:    ds_read_b32 v0, v0
3266; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3267; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3268; GFX6-NEXT:    ds_write_b32 v1, v0
3269; GFX6-NEXT:    s_endpgm
3270;
3271; GFX7-LABEL: local_workgroup_one_as_monotonic_load:
3272; GFX7:       ; %bb.0: ; %entry
3273; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3274; GFX7-NEXT:    s_mov_b32 m0, -1
3275; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3276; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3277; GFX7-NEXT:    ds_read_b32 v0, v0
3278; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3279; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3280; GFX7-NEXT:    ds_write_b32 v1, v0
3281; GFX7-NEXT:    s_endpgm
3282;
3283; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load:
3284; GFX10-WGP:       ; %bb.0: ; %entry
3285; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3286; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3287; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3288; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3289; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3290; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3291; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3292; GFX10-WGP-NEXT:    s_endpgm
3293;
3294; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load:
3295; GFX10-CU:       ; %bb.0: ; %entry
3296; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3297; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3298; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3299; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3300; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3301; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3302; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3303; GFX10-CU-NEXT:    s_endpgm
3304;
3305; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load:
3306; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3307; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3308; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3309; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3310; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3311; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3313; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3314; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3315; SKIP-CACHE-INV-NEXT:    s_endpgm
3316;
3317; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
3318; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3319; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3320; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3321; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3322; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3323; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3324; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3325; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3326; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3327;
3328; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
3329; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3330; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3331; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3332; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3333; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3334; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3335; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3336; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3337; GFX90A-TGSPLIT-NEXT:    s_endpgm
3338    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3339entry:
  ; The atomic load is the operation under test; the store only keeps the
  ; loaded value live so the load is not dropped.
3340  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4
3341  store i32 %val, i32 addrspace(3)* %out
3342  ret void
3343}
3344
; Test: atomic i32 load from local memory (addrspace(3)) at syncscope
; "workgroup-one-as" with acquire ordering; the loaded value is then written
; to %out with an ordinary (non-atomic) store.
;
; The autogenerated check lines expect the same shape in every run: a
; ds_read_b32, an "s_waitcnt lgkmcnt(0)", and a ds_write_b32. Even with
; acquire ordering, no cache-invalidate instruction and no vmcnt/vscnt wait is
; expected for any target, including the gfx1010 run without +cumode and the
; gfx90a run with +tgsplit.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3345define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
3346; GFX6-LABEL: local_workgroup_one_as_acquire_load:
3347; GFX6:       ; %bb.0: ; %entry
3348; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3349; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3350; GFX6-NEXT:    s_mov_b32 m0, -1
3351; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3352; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3353; GFX6-NEXT:    ds_read_b32 v0, v0
3354; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3355; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3356; GFX6-NEXT:    ds_write_b32 v1, v0
3357; GFX6-NEXT:    s_endpgm
3358;
3359; GFX7-LABEL: local_workgroup_one_as_acquire_load:
3360; GFX7:       ; %bb.0: ; %entry
3361; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3362; GFX7-NEXT:    s_mov_b32 m0, -1
3363; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3364; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3365; GFX7-NEXT:    ds_read_b32 v0, v0
3366; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3367; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3368; GFX7-NEXT:    ds_write_b32 v1, v0
3369; GFX7-NEXT:    s_endpgm
3370;
3371; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load:
3372; GFX10-WGP:       ; %bb.0: ; %entry
3373; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3374; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3375; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3376; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3377; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3378; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3379; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3380; GFX10-WGP-NEXT:    s_endpgm
3381;
3382; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load:
3383; GFX10-CU:       ; %bb.0: ; %entry
3384; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3385; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3388; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3389; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3390; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3391; GFX10-CU-NEXT:    s_endpgm
3392;
3393; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load:
3394; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3395; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3396; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3397; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3398; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3399; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3401; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3402; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3403; SKIP-CACHE-INV-NEXT:    s_endpgm
3404;
3405; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
3406; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3407; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3408; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3409; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3410; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3411; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3412; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3413; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3414; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3415;
3416; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
3417; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3418; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3419; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3420; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3421; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3422; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3423; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3424; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3425; GFX90A-TGSPLIT-NEXT:    s_endpgm
3426    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3427entry:
  ; The atomic load is the operation under test; the store only keeps the
  ; loaded value live so the load is not dropped.
3428  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4
3429  store i32 %val, i32 addrspace(3)* %out
3430  ret void
3431}
3432
; Test: atomic i32 load from local memory (addrspace(3)) at syncscope
; "workgroup-one-as" with seq_cst ordering; the loaded value is then written
; to %out with an ordinary (non-atomic) store.
;
; The autogenerated check lines expect the same shape in every run: a
; ds_read_b32, an "s_waitcnt lgkmcnt(0)", and a ds_write_b32. Even with
; seq_cst ordering, no cache-invalidate instruction and no vmcnt/vscnt wait is
; expected for any target, including the gfx1010 run without +cumode and the
; gfx90a run with +tgsplit.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3433define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
3434; GFX6-LABEL: local_workgroup_one_as_seq_cst_load:
3435; GFX6:       ; %bb.0: ; %entry
3436; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3437; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3438; GFX6-NEXT:    s_mov_b32 m0, -1
3439; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3440; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3441; GFX6-NEXT:    ds_read_b32 v0, v0
3442; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3443; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3444; GFX6-NEXT:    ds_write_b32 v1, v0
3445; GFX6-NEXT:    s_endpgm
3446;
3447; GFX7-LABEL: local_workgroup_one_as_seq_cst_load:
3448; GFX7:       ; %bb.0: ; %entry
3449; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3450; GFX7-NEXT:    s_mov_b32 m0, -1
3451; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3452; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3453; GFX7-NEXT:    ds_read_b32 v0, v0
3454; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3455; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3456; GFX7-NEXT:    ds_write_b32 v1, v0
3457; GFX7-NEXT:    s_endpgm
3458;
3459; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load:
3460; GFX10-WGP:       ; %bb.0: ; %entry
3461; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3462; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3464; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3465; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3466; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3467; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3468; GFX10-WGP-NEXT:    s_endpgm
3469;
3470; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load:
3471; GFX10-CU:       ; %bb.0: ; %entry
3472; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3473; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3474; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3475; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3476; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3477; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3478; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3479; GFX10-CU-NEXT:    s_endpgm
3480;
3481; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load:
3482; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3483; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3484; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3485; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3487; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3489; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3490; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3491; SKIP-CACHE-INV-NEXT:    s_endpgm
3492;
3493; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
3494; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3495; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3496; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3497; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3498; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3499; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3500; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3501; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3503;
3504; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
3505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3507; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3508; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3509; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3511; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3512; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3513; GFX90A-TGSPLIT-NEXT:    s_endpgm
3514    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3515entry:
  ; The atomic load is the operation under test; the store only keeps the
  ; loaded value live so the load is not dropped.
3516  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4
3517  store i32 %val, i32 addrspace(3)* %out
3518  ret void
3519}
3520
; Test: atomic i32 store of the kernel argument %in to local memory
; (addrspace(3)) at syncscope "workgroup-one-as" with unordered ordering.
;
; The autogenerated check lines expect, in every run, the kernel arguments to
; be loaded and moved into VGPRs followed by a single ds_write_b32. The only
; s_waitcnt expected is the lgkmcnt(0) wait after the s_load; no
; cache-invalidate instruction is expected for any target.
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3521define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
3522; GFX6-LABEL: local_workgroup_one_as_unordered_store:
3523; GFX6:       ; %bb.0: ; %entry
3524; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3525; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3526; GFX6-NEXT:    s_mov_b32 m0, -1
3527; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3529; GFX6-NEXT:    v_mov_b32_e32 v0, s1
3530; GFX6-NEXT:    ds_write_b32 v0, v1
3531; GFX6-NEXT:    s_endpgm
3532;
3533; GFX7-LABEL: local_workgroup_one_as_unordered_store:
3534; GFX7:       ; %bb.0: ; %entry
3535; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3536; GFX7-NEXT:    s_mov_b32 m0, -1
3537; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3538; GFX7-NEXT:    v_mov_b32_e32 v0, s1
3539; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3540; GFX7-NEXT:    ds_write_b32 v0, v1
3541; GFX7-NEXT:    s_endpgm
3542;
3543; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store:
3544; GFX10-WGP:       ; %bb.0: ; %entry
3545; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3546; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3547; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3548; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3549; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3550; GFX10-WGP-NEXT:    s_endpgm
3551;
3552; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store:
3553; GFX10-CU:       ; %bb.0: ; %entry
3554; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3555; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3556; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3557; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3558; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3559; GFX10-CU-NEXT:    s_endpgm
3560;
3561; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store:
3562; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3563; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3564; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3565; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3566; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3568; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3569; SKIP-CACHE-INV-NEXT:    s_endpgm
3570;
3571; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
3572; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3573; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3574; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3575; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3576; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3577; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3578; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3579;
3580; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
3581; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3582; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3583; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3585; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3586; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3587; GFX90A-TGSPLIT-NEXT:    s_endpgm
3588    i32 %in, i32 addrspace(3)* %out) {
3589entry:
  ; The atomic store is the only operation in the kernel body.
3590  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4
3591  ret void
3592}
3593
; Test: atomic i32 store of %in to the addrspace(3) pointer %out with
; syncscope("workgroup-one-as") and 'monotonic' ordering.
; In every checked configuration the store is expected to become a single
; ds_write_b32 that is followed directly by s_endpgm. The only s_waitcnt in
; the expected output sits between the kernel-argument s_load and the v_mov
; instructions that copy the loaded SGPRs into VGPRs.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX6-LABEL: local_workgroup_one_as_monotonic_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_monotonic_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4
  ret void
}
3666
; Test: atomic i32 store of %in to the addrspace(3) pointer %out with
; syncscope("workgroup-one-as") and 'release' ordering.
; In every checked configuration the store is expected to become a single
; ds_write_b32 followed directly by s_endpgm. No extra s_waitcnt or cache
; instruction is expected between the v_mov instructions and the
; ds_write_b32; the only s_waitcnt follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX6-LABEL: local_workgroup_one_as_release_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_release_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_release_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_release_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4
  ret void
}
3739
; Test: atomic i32 store of %in to the addrspace(3) pointer %out with
; syncscope("workgroup-one-as") and 'seq_cst' ordering.
; In every checked configuration the store is expected to become a single
; ds_write_b32 followed directly by s_endpgm. No extra s_waitcnt or cache
; instruction is expected between the v_mov instructions and the
; ds_write_b32; the only s_waitcnt follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX6-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4
  ret void
}
3812
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'monotonic' ordering. The returned
; value is not used by the IR.
; In every checked configuration the exchange is expected to become a single
; ds_wrxchg_rtn_b32 followed directly by s_endpgm. The only s_waitcnt in the
; expected output follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; %val is not read again before the function returns.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic
  ret void
}
3885
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'acquire' ordering. The returned
; value is not used by the IR.
; In every checked configuration the exchange is expected to become a single
; ds_wrxchg_rtn_b32 followed directly by s_endpgm, i.e. no s_waitcnt or cache
; instruction is expected after the atomic. The only s_waitcnt in the
; expected output follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; %val is not read again before the function returns.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
  ret void
}
3958
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'release' ordering. The returned
; value is not used by the IR.
; In every checked configuration the exchange is expected to become a single
; ds_wrxchg_rtn_b32 followed directly by s_endpgm, with no extra s_waitcnt or
; cache instruction before the atomic. The only s_waitcnt in the expected
; output follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; %val is not read again before the function returns.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release
  ret void
}
4031
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'acq_rel' ordering. The returned
; value is not used by the IR.
; In every checked configuration the exchange is expected to become a single
; ds_wrxchg_rtn_b32 followed directly by s_endpgm, with no extra s_waitcnt or
; cache instruction around the atomic. The only s_waitcnt in the expected
; output follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; %val is not read again before the function returns.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
  ret void
}
4104
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'seq_cst' ordering. The returned
; value is not used by the IR.
; In every checked configuration the exchange is expected to become a single
; ds_wrxchg_rtn_b32 followed directly by s_endpgm, with no extra s_waitcnt or
; cache instruction around the atomic. The only s_waitcnt in the expected
; output follows the kernel-argument s_load.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; %val is not read again before the function returns.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
  ret void
}
4177
; Test: volatile 'atomicrmw xchg' of %in into the addrspace(3) pointer %out
; with syncscope("workgroup-one-as") and 'acquire' ordering, where the value
; returned by the exchange is then written back to %out with a plain
; (non-atomic) store.
; In every checked configuration the expected sequence is ds_wrxchg_rtn_b32,
; then s_waitcnt lgkmcnt(0), then a ds_write_b32 that stores the register
; written by the exchange (v1), then s_endpgm.
; NOTE(review): presumably the s_waitcnt between the two DS instructions is
; there because the store consumes the exchange result, rather than being
; added for the acquire ordering itself - confirm against the AMDGPU memory
; model documentation.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
  ; Write the previous contents of %out (the xchg result) back to %out.
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4265
; Volatile atomic exchange on an addrspace(3) pointer with acq_rel ordering at
; syncscope "workgroup-one-as"; the value returned by the exchange is stored
; back through %out. Every checked configuration below expects the same
; sequence: ds_wrxchg_rtn_b32, a wait on lgkmcnt(0), then ds_write_b32, with
; no other instruction between the exchange and s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
  ; The old value returned by the exchange is written back, so %val is live.
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4353
; Volatile atomic exchange on an addrspace(3) pointer with seq_cst ordering at
; syncscope "workgroup-one-as"; the value returned by the exchange is stored
; back through %out. Every checked configuration below expects the same
; sequence: ds_wrxchg_rtn_b32, a wait on lgkmcnt(0), then ds_write_b32, with
; no other instruction between the exchange and s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
  ; The old value returned by the exchange is written back, so %val is live.
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4441
; Volatile cmpxchg on an addrspace(3) pointer with success ordering monotonic
; and failure ordering monotonic at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
  ret void
}
4523
; Volatile cmpxchg on an addrspace(3) pointer with success ordering acquire
; and failure ordering monotonic at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
  ret void
}
4605
; Volatile cmpxchg on an addrspace(3) pointer with success ordering release
; and failure ordering monotonic at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
  ret void
}
4687
; Volatile cmpxchg on an addrspace(3) pointer with success ordering acq_rel
; and failure ordering monotonic at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
  ret void
}
4769
; Volatile cmpxchg on an addrspace(3) pointer with success ordering seq_cst
; and failure ordering monotonic at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
  ret void
}
4851
; Volatile cmpxchg on an addrspace(3) pointer with success ordering acquire
; and failure ordering acquire at syncscope "workgroup-one-as". The result
; (%val) is never used. Every checked configuration below expects a single
; ds_cmpst_b32 at byte offset 16, followed directly by s_endpgm.
define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; i32 element index 4 is byte offset 16, the offset expected on ds_cmpst_b32.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
  ret void
}
4933
; Non-returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as")
; with success ordering release and failure ordering acquire; the result of the
; cmpxchg is unused. Every run configuration below expects only the
; kernel-argument loads, the operand moves and a single ds_cmpst_b32 at
; offset:16 directly followed by s_endpgm, i.e. nothing is emitted after the
; atomic itself.
4934define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
4935; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4936; GFX6:       ; %bb.0: ; %entry
4937; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4938; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4939; GFX6-NEXT:    s_mov_b32 m0, -1
4940; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4941; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4942; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4943; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4944; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4945; GFX6-NEXT:    s_endpgm
4946;
4947; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4948; GFX7:       ; %bb.0: ; %entry
4949; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4950; GFX7-NEXT:    s_mov_b32 m0, -1
4951; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4952; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4953; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4954; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4955; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4956; GFX7-NEXT:    s_endpgm
4957;
4958; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4959; GFX10-WGP:       ; %bb.0: ; %entry
4960; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4961; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4962; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4963; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4964; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4965; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4966; GFX10-WGP-NEXT:    s_endpgm
4967;
4968; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4969; GFX10-CU:       ; %bb.0: ; %entry
4970; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4971; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4972; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4973; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4974; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4975; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4976; GFX10-CU-NEXT:    s_endpgm
4977;
4978; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4979; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4980; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4981; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4982; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4983; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4985; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4986; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4987; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4988; SKIP-CACHE-INV-NEXT:    s_endpgm
4989;
4990; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
4991; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4992; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4993; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4994; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4995; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4996; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4997; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4998; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4999;
5000; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
5001; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5002; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5003; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5004; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5005; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5006; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5007; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5008; GFX90A-TGSPLIT-NEXT:    s_endpgm
5009    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5010entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_b32.
5011  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement; %val is never read.
5012  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
5013  ret void
5014}
5015
; Non-returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as")
; with success ordering acq_rel and failure ordering acquire; the result of the
; cmpxchg is unused. Every run configuration below expects only the
; kernel-argument loads, the operand moves and a single ds_cmpst_b32 at
; offset:16 directly followed by s_endpgm.
5016define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
5017; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5018; GFX6:       ; %bb.0: ; %entry
5019; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5020; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5021; GFX6-NEXT:    s_mov_b32 m0, -1
5022; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5023; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5024; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5025; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5026; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5027; GFX6-NEXT:    s_endpgm
5028;
5029; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5030; GFX7:       ; %bb.0: ; %entry
5031; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5032; GFX7-NEXT:    s_mov_b32 m0, -1
5033; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5034; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5035; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5036; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5037; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5038; GFX7-NEXT:    s_endpgm
5039;
5040; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5041; GFX10-WGP:       ; %bb.0: ; %entry
5042; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5043; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5044; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5045; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5046; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5047; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5048; GFX10-WGP-NEXT:    s_endpgm
5049;
5050; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5051; GFX10-CU:       ; %bb.0: ; %entry
5052; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5053; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5055; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5056; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5057; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5058; GFX10-CU-NEXT:    s_endpgm
5059;
5060; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5061; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5062; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5063; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5064; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5065; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5066; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5069; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5070; SKIP-CACHE-INV-NEXT:    s_endpgm
5071;
5072; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5073; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5074; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5075; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5076; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5077; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5078; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5079; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5080; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5081;
5082; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
5083; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5084; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5085; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5086; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5087; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5088; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5089; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5090; GFX90A-TGSPLIT-NEXT:    s_endpgm
5091    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5092entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_b32.
5093  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement; %val is never read.
5094  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
5095  ret void
5096}
5097
; Non-returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as")
; with success ordering seq_cst and failure ordering acquire; the result of the
; cmpxchg is unused. Every run configuration below expects only the
; kernel-argument loads, the operand moves and a single ds_cmpst_b32 at
; offset:16 directly followed by s_endpgm.
5098define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
5099; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5100; GFX6:       ; %bb.0: ; %entry
5101; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5102; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5103; GFX6-NEXT:    s_mov_b32 m0, -1
5104; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5105; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5106; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5107; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5108; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5109; GFX6-NEXT:    s_endpgm
5110;
5111; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5112; GFX7:       ; %bb.0: ; %entry
5113; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5114; GFX7-NEXT:    s_mov_b32 m0, -1
5115; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5116; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5117; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5118; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5119; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5120; GFX7-NEXT:    s_endpgm
5121;
5122; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5123; GFX10-WGP:       ; %bb.0: ; %entry
5124; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5125; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5126; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5127; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5128; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5129; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5130; GFX10-WGP-NEXT:    s_endpgm
5131;
5132; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5133; GFX10-CU:       ; %bb.0: ; %entry
5134; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5135; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5136; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5137; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5138; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5139; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5140; GFX10-CU-NEXT:    s_endpgm
5141;
5142; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5143; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5144; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5145; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5146; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5147; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5148; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5150; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5151; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5152; SKIP-CACHE-INV-NEXT:    s_endpgm
5153;
5154; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5155; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5156; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5157; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5158; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5159; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5160; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5161; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5162; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5163;
5164; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
5165; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5166; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5167; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5168; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5169; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5170; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5171; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5172; GFX90A-TGSPLIT-NEXT:    s_endpgm
5173    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5174entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_b32.
5175  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement; %val is never read.
5176  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
5177  ret void
5178}
5179
; Non-returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as")
; with seq_cst for both the success and the failure ordering; the result of the
; cmpxchg is unused. Every run configuration below expects only the
; kernel-argument loads, the operand moves and a single ds_cmpst_b32 at
; offset:16 directly followed by s_endpgm.
5180define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
5181; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5182; GFX6:       ; %bb.0: ; %entry
5183; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5184; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5185; GFX6-NEXT:    s_mov_b32 m0, -1
5186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5187; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5188; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5189; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5190; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5191; GFX6-NEXT:    s_endpgm
5192;
5193; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5194; GFX7:       ; %bb.0: ; %entry
5195; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5196; GFX7-NEXT:    s_mov_b32 m0, -1
5197; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5198; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5199; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5200; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5201; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5202; GFX7-NEXT:    s_endpgm
5203;
5204; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5205; GFX10-WGP:       ; %bb.0: ; %entry
5206; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5207; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5208; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5209; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5210; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5211; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5212; GFX10-WGP-NEXT:    s_endpgm
5213;
5214; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5215; GFX10-CU:       ; %bb.0: ; %entry
5216; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5217; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5218; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5219; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5220; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5221; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5222; GFX10-CU-NEXT:    s_endpgm
5223;
5224; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5225; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5226; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5227; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5228; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5229; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5232; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5233; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5234; SKIP-CACHE-INV-NEXT:    s_endpgm
5235;
5236; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5237; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5238; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5239; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5240; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5241; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5242; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5243; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5244; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5245;
5246; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
5247; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5248; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5249; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5250; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5251; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5252; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5253; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5254; GFX90A-TGSPLIT-NEXT:    s_endpgm
5255    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5256entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_b32.
5257  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement; %val is never read.
5258  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
5259  ret void
5260}
5261
; Returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as") with
; success ordering acquire and failure ordering monotonic. The loaded value
; (field 0 of the cmpxchg result) is stored back through %out, so every run
; configuration below expects ds_cmpst_rtn_b32 at offset:16, then
; s_waitcnt lgkmcnt(0), then a ds_write_b32 of the returned register.
; Presumably that wait is there for the data dependency on the returned value
; rather than for the acquire ordering -- TODO confirm against the memory
; legalizer.
5262define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
5263; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5264; GFX6:       ; %bb.0: ; %entry
5265; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5266; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5267; GFX6-NEXT:    s_mov_b32 m0, -1
5268; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5269; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5270; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5271; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5272; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5273; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5274; GFX6-NEXT:    ds_write_b32 v0, v1
5275; GFX6-NEXT:    s_endpgm
5276;
5277; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5278; GFX7:       ; %bb.0: ; %entry
5279; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5280; GFX7-NEXT:    s_mov_b32 m0, -1
5281; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5282; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5283; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5284; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5285; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5286; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5287; GFX7-NEXT:    ds_write_b32 v0, v1
5288; GFX7-NEXT:    s_endpgm
5289;
5290; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5291; GFX10-WGP:       ; %bb.0: ; %entry
5292; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5293; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5294; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5295; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5296; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5297; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5298; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5299; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5300; GFX10-WGP-NEXT:    s_endpgm
5301;
5302; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5303; GFX10-CU:       ; %bb.0: ; %entry
5304; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5305; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5306; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5307; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5308; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5309; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5310; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5311; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5312; GFX10-CU-NEXT:    s_endpgm
5313;
5314; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5315; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5316; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5317; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5318; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5319; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5320; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5321; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5322; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5323; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5324; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5325; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5326; SKIP-CACHE-INV-NEXT:    s_endpgm
5327;
5328; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5329; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5330; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5331; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5332; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5333; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5334; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5335; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5336; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5337; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5338; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5339;
5340; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
5341; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5342; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5343; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5344; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5345; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5346; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5347; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5348; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5349; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5350; GFX90A-TGSPLIT-NEXT:    s_endpgm
5351    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5352entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_rtn_b32.
5353  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement.
5354  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
  ; Field 0 is the value read from memory; the i1 success flag is ignored.
5355  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result keeps the returning form of the atomic alive; note the
  ; store goes to %out itself, not to %gep.
5356  store i32 %val0, i32 addrspace(3)* %out, align 4
5357  ret void
5358}
5359
; Returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as") with
; success ordering acq_rel and failure ordering monotonic. The loaded value
; (field 0 of the cmpxchg result) is stored back through %out, so every run
; configuration below expects ds_cmpst_rtn_b32 at offset:16, then
; s_waitcnt lgkmcnt(0), then a ds_write_b32 of the returned register.
; Presumably that wait is there for the data dependency on the returned value
; rather than for the memory ordering -- TODO confirm against the memory
; legalizer.
5360define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
5361; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5362; GFX6:       ; %bb.0: ; %entry
5363; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5364; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5365; GFX6-NEXT:    s_mov_b32 m0, -1
5366; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5367; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5368; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5369; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5370; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5371; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5372; GFX6-NEXT:    ds_write_b32 v0, v1
5373; GFX6-NEXT:    s_endpgm
5374;
5375; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5376; GFX7:       ; %bb.0: ; %entry
5377; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5378; GFX7-NEXT:    s_mov_b32 m0, -1
5379; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5380; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5381; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5382; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5383; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5384; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5385; GFX7-NEXT:    ds_write_b32 v0, v1
5386; GFX7-NEXT:    s_endpgm
5387;
5388; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5389; GFX10-WGP:       ; %bb.0: ; %entry
5390; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5391; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5392; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5393; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5394; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5395; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5396; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5397; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5398; GFX10-WGP-NEXT:    s_endpgm
5399;
5400; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5401; GFX10-CU:       ; %bb.0: ; %entry
5402; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5403; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5404; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5405; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5406; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5407; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5408; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5409; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5410; GFX10-CU-NEXT:    s_endpgm
5411;
5412; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5413; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5414; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5415; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5416; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5417; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5419; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5420; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5421; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5422; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5423; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5424; SKIP-CACHE-INV-NEXT:    s_endpgm
5425;
5426; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5427; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5428; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5429; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5430; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5431; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5432; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5433; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5434; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5435; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5436; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5437;
5438; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
5439; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5440; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5441; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5442; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5443; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5444; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5445; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5446; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5447; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5448; GFX90A-TGSPLIT-NEXT:    s_endpgm
5449    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5450entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_rtn_b32.
5451  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement.
5452  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
  ; Field 0 is the value read from memory; the i1 success flag is ignored.
5453  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result keeps the returning form of the atomic alive; note the
  ; store goes to %out itself, not to %gep.
5454  store i32 %val0, i32 addrspace(3)* %out, align 4
5455  ret void
5456}
5457
; Returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as") with
; success ordering seq_cst and failure ordering monotonic. The loaded value
; (field 0 of the cmpxchg result) is stored back through %out, so every run
; configuration below expects ds_cmpst_rtn_b32 at offset:16, then
; s_waitcnt lgkmcnt(0), then a ds_write_b32 of the returned register.
; Presumably that wait is there for the data dependency on the returned value
; rather than for the memory ordering -- TODO confirm against the memory
; legalizer.
5458define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
5459; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5460; GFX6:       ; %bb.0: ; %entry
5461; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5462; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5463; GFX6-NEXT:    s_mov_b32 m0, -1
5464; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5465; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5466; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5467; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5468; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5470; GFX6-NEXT:    ds_write_b32 v0, v1
5471; GFX6-NEXT:    s_endpgm
5472;
5473; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5474; GFX7:       ; %bb.0: ; %entry
5475; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5476; GFX7-NEXT:    s_mov_b32 m0, -1
5477; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5478; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5479; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5480; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5481; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5482; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5483; GFX7-NEXT:    ds_write_b32 v0, v1
5484; GFX7-NEXT:    s_endpgm
5485;
5486; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5487; GFX10-WGP:       ; %bb.0: ; %entry
5488; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5489; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5490; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5491; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5492; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5493; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5494; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5495; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5496; GFX10-WGP-NEXT:    s_endpgm
5497;
5498; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5499; GFX10-CU:       ; %bb.0: ; %entry
5500; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5501; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5502; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5503; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5504; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5505; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5506; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5507; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5508; GFX10-CU-NEXT:    s_endpgm
5509;
5510; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5511; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5512; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5513; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5514; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5515; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5516; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5517; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5518; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5519; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5520; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5521; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5522; SKIP-CACHE-INV-NEXT:    s_endpgm
5523;
5524; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5525; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5526; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5527; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5528; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5529; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5530; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5531; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5532; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5533; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5534; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5535;
5536; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
5537; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5538; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5539; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5540; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5541; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5542; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5543; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5544; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5545; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5546; GFX90A-TGSPLIT-NEXT:    s_endpgm
5547    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5548entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_rtn_b32.
5549  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement.
5550  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
  ; Field 0 is the value read from memory; the i1 success flag is ignored.
5551  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result keeps the returning form of the atomic alive; note the
  ; store goes to %out itself, not to %gep.
5552  store i32 %val0, i32 addrspace(3)* %out, align 4
5553  ret void
5554}
5555
; Returning cmpxchg on LDS (addrspace(3)) at syncscope("workgroup-one-as") with
; acquire for both the success and the failure ordering. The loaded value
; (field 0 of the cmpxchg result) is stored back through %out, so every run
; configuration below expects ds_cmpst_rtn_b32 at offset:16, then
; s_waitcnt lgkmcnt(0), then a ds_write_b32 of the returned register.
; Presumably that wait is there for the data dependency on the returned value
; rather than for the acquire ordering -- TODO confirm against the memory
; legalizer.
5556define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
5557; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5558; GFX6:       ; %bb.0: ; %entry
5559; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5560; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5561; GFX6-NEXT:    s_mov_b32 m0, -1
5562; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5563; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5564; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5565; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5566; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5567; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5568; GFX6-NEXT:    ds_write_b32 v0, v1
5569; GFX6-NEXT:    s_endpgm
5570;
5571; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5572; GFX7:       ; %bb.0: ; %entry
5573; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5574; GFX7-NEXT:    s_mov_b32 m0, -1
5575; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5576; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5577; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5578; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5579; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5580; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5581; GFX7-NEXT:    ds_write_b32 v0, v1
5582; GFX7-NEXT:    s_endpgm
5583;
5584; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5585; GFX10-WGP:       ; %bb.0: ; %entry
5586; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5587; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5588; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5589; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5590; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5591; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5592; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5593; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5594; GFX10-WGP-NEXT:    s_endpgm
5595;
5596; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5597; GFX10-CU:       ; %bb.0: ; %entry
5598; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5599; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5600; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5601; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5602; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5603; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5604; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5605; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5606; GFX10-CU-NEXT:    s_endpgm
5607;
5608; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5609; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5610; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5611; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5612; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5613; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5614; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5615; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5616; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5617; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5618; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5619; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5620; SKIP-CACHE-INV-NEXT:    s_endpgm
5621;
5622; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5623; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5624; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5625; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5626; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5627; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5628; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5629; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5630; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5631; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5632; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5633;
5634; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
5635; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5636; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5637; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5638; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5639; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5640; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5641; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5642; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5643; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5644; GFX90A-TGSPLIT-NEXT:    s_endpgm
5645    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5646entry:
  ; Element 4 of an i32 array, i.e. byte offset 16; this is the offset:16
  ; operand on the expected ds_cmpst_rtn_b32.
5647  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the expected value and %in the replacement.
5648  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
  ; Field 0 is the value read from memory; the i1 success flag is ignored.
5649  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result keeps the returning form of the atomic alive; note the
  ; store goes to %out itself, not to %gep.
5650  store i32 %val0, i32 addrspace(3)* %out, align 4
5651  ret void
5652}
5653
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with
; syncscope("workgroup-one-as"), success ordering release and failure
; ordering acquire. The value returned by the cmpxchg is stored back to %out.
;   %out - addrspace(3) base pointer; the cmpxchg targets element 4 and the
;          final store targets element 0.
;   %in  - new value written when the comparison succeeds.
;   %old - expected value used for the comparison.
; For every llc configuration listed at the top of the file, the assertions
; below expect one ds_cmpst_rtn_b32, then s_waitcnt lgkmcnt(0), then a
; ds_write_b32 of the returned value.
5654define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
5655; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5656; GFX6:       ; %bb.0: ; %entry
5657; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5658; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5659; GFX6-NEXT:    s_mov_b32 m0, -1
5660; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5661; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5662; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5663; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5664; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5665; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5666; GFX6-NEXT:    ds_write_b32 v0, v1
5667; GFX6-NEXT:    s_endpgm
5668;
5669; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5670; GFX7:       ; %bb.0: ; %entry
5671; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5672; GFX7-NEXT:    s_mov_b32 m0, -1
5673; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5674; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5675; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5676; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5677; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5678; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5679; GFX7-NEXT:    ds_write_b32 v0, v1
5680; GFX7-NEXT:    s_endpgm
5681;
5682; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5683; GFX10-WGP:       ; %bb.0: ; %entry
5684; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5685; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5686; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5687; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5688; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5689; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5690; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5691; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5692; GFX10-WGP-NEXT:    s_endpgm
5693;
5694; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5695; GFX10-CU:       ; %bb.0: ; %entry
5696; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5697; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5698; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5699; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5700; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5701; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5702; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5703; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5704; GFX10-CU-NEXT:    s_endpgm
5705;
5706; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5707; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5708; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5709; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5710; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5711; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5712; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5713; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5714; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5715; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5716; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5717; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5718; SKIP-CACHE-INV-NEXT:    s_endpgm
5719;
5720; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5721; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5722; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5723; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5724; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5725; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5726; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5727; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5728; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5729; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5730; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5731;
5732; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
5733; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5734; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5735; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5736; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5737; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5738; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5739; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5740; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5741; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5742; GFX90A-TGSPLIT-NEXT:    s_endpgm
5743    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5744entry:
  ; Index 4 in units of i32 is a 16-byte displacement from %out; it shows up
  ; as "offset:16" on the ds_cmpst_rtn_b32 in the assertions above.
5745  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old, write %in on match; success ordering release,
  ; failure ordering acquire.
5746  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
5747  %val0 = extractvalue { i32, i1 } %val, 0
5748  store i32 %val0, i32 addrspace(3)* %out, align 4
5749  ret void
5750}
5751
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with
; syncscope("workgroup-one-as"), success ordering acq_rel and failure
; ordering acquire. The value returned by the cmpxchg is stored back to %out.
;   %out - addrspace(3) base pointer; the cmpxchg targets element 4 and the
;          final store targets element 0.
;   %in  - new value written when the comparison succeeds.
;   %old - expected value used for the comparison.
; For every llc configuration listed at the top of the file, the assertions
; below expect one ds_cmpst_rtn_b32, then s_waitcnt lgkmcnt(0), then a
; ds_write_b32 of the returned value.
5752define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
5753; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5754; GFX6:       ; %bb.0: ; %entry
5755; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5756; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5757; GFX6-NEXT:    s_mov_b32 m0, -1
5758; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5759; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5760; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5761; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5762; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5763; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5764; GFX6-NEXT:    ds_write_b32 v0, v1
5765; GFX6-NEXT:    s_endpgm
5766;
5767; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5768; GFX7:       ; %bb.0: ; %entry
5769; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5770; GFX7-NEXT:    s_mov_b32 m0, -1
5771; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5772; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5773; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5774; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5775; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5776; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5777; GFX7-NEXT:    ds_write_b32 v0, v1
5778; GFX7-NEXT:    s_endpgm
5779;
5780; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5781; GFX10-WGP:       ; %bb.0: ; %entry
5782; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5783; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5784; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5785; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5786; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5787; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5788; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5789; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5790; GFX10-WGP-NEXT:    s_endpgm
5791;
5792; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5793; GFX10-CU:       ; %bb.0: ; %entry
5794; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5795; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5796; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5797; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5798; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5799; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5800; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5801; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5802; GFX10-CU-NEXT:    s_endpgm
5803;
5804; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5805; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5806; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5807; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5808; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5809; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5810; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5811; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5813; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5814; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5815; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5816; SKIP-CACHE-INV-NEXT:    s_endpgm
5817;
5818; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5819; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5820; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5821; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5822; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5823; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5824; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5825; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5826; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5827; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5828; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5829;
5830; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
5831; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5832; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5833; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5834; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5835; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5836; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5837; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5838; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5839; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5840; GFX90A-TGSPLIT-NEXT:    s_endpgm
5841    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5842entry:
  ; Index 4 in units of i32 is a 16-byte displacement from %out; it shows up
  ; as "offset:16" on the ds_cmpst_rtn_b32 in the assertions above.
5843  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old, write %in on match; success ordering acq_rel,
  ; failure ordering acquire.
5844  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
5845  %val0 = extractvalue { i32, i1 } %val, 0
5846  store i32 %val0, i32 addrspace(3)* %out, align 4
5847  ret void
5848}
5849
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with
; syncscope("workgroup-one-as"), success ordering seq_cst and failure
; ordering acquire. The value returned by the cmpxchg is stored back to %out.
;   %out - addrspace(3) base pointer; the cmpxchg targets element 4 and the
;          final store targets element 0.
;   %in  - new value written when the comparison succeeds.
;   %old - expected value used for the comparison.
; For every llc configuration listed at the top of the file, the assertions
; below expect one ds_cmpst_rtn_b32, then s_waitcnt lgkmcnt(0), then a
; ds_write_b32 of the returned value.
5850define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
5851; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5852; GFX6:       ; %bb.0: ; %entry
5853; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5854; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5855; GFX6-NEXT:    s_mov_b32 m0, -1
5856; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5857; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5858; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5859; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5860; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5861; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5862; GFX6-NEXT:    ds_write_b32 v0, v1
5863; GFX6-NEXT:    s_endpgm
5864;
5865; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5866; GFX7:       ; %bb.0: ; %entry
5867; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5868; GFX7-NEXT:    s_mov_b32 m0, -1
5869; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5870; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5871; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5872; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5873; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5874; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5875; GFX7-NEXT:    ds_write_b32 v0, v1
5876; GFX7-NEXT:    s_endpgm
5877;
5878; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5879; GFX10-WGP:       ; %bb.0: ; %entry
5880; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5881; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5882; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5883; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5884; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5885; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5886; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5887; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5888; GFX10-WGP-NEXT:    s_endpgm
5889;
5890; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5891; GFX10-CU:       ; %bb.0: ; %entry
5892; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5893; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5895; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5896; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5897; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5898; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5899; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5900; GFX10-CU-NEXT:    s_endpgm
5901;
5902; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5903; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5904; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5905; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5906; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5907; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5908; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5909; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5910; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5911; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5912; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5913; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5914; SKIP-CACHE-INV-NEXT:    s_endpgm
5915;
5916; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5917; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5918; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5919; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5920; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5921; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5922; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5923; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5924; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5925; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5926; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5927;
5928; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5929; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5930; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5931; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5932; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5933; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5934; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5935; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5936; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5937; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5938; GFX90A-TGSPLIT-NEXT:    s_endpgm
5939    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5940entry:
  ; Index 4 in units of i32 is a 16-byte displacement from %out; it shows up
  ; as "offset:16" on the ds_cmpst_rtn_b32 in the assertions above.
5941  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old, write %in on match; success ordering seq_cst,
  ; failure ordering acquire.
5942  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
5943  %val0 = extractvalue { i32, i1 } %val, 0
5944  store i32 %val0, i32 addrspace(3)* %out, align 4
5945  ret void
5946}
5947
; Test kernel: volatile cmpxchg through an addrspace(3) pointer with
; syncscope("workgroup-one-as"), success ordering seq_cst and failure
; ordering seq_cst. The value returned by the cmpxchg is stored back to %out.
;   %out - addrspace(3) base pointer; the cmpxchg targets element 4 and the
;          final store targets element 0.
;   %in  - new value written when the comparison succeeds.
;   %old - expected value used for the comparison.
; For every llc configuration listed at the top of the file, the assertions
; below expect one ds_cmpst_rtn_b32, then s_waitcnt lgkmcnt(0), then a
; ds_write_b32 of the returned value.
5948define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
5949; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5950; GFX6:       ; %bb.0: ; %entry
5951; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5952; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5953; GFX6-NEXT:    s_mov_b32 m0, -1
5954; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5955; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5956; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5957; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5958; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5959; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5960; GFX6-NEXT:    ds_write_b32 v0, v1
5961; GFX6-NEXT:    s_endpgm
5962;
5963; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5964; GFX7:       ; %bb.0: ; %entry
5965; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5966; GFX7-NEXT:    s_mov_b32 m0, -1
5967; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5968; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5969; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5970; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5971; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5972; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5973; GFX7-NEXT:    ds_write_b32 v0, v1
5974; GFX7-NEXT:    s_endpgm
5975;
5976; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5977; GFX10-WGP:       ; %bb.0: ; %entry
5978; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5979; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5980; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5981; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5982; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5983; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5984; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5985; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5986; GFX10-WGP-NEXT:    s_endpgm
5987;
5988; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5989; GFX10-CU:       ; %bb.0: ; %entry
5990; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5991; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5992; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5993; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5994; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5995; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5996; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5997; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5998; GFX10-CU-NEXT:    s_endpgm
5999;
6000; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
6001; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6002; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6003; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
6004; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
6005; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6006; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6007; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
6008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
6009; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
6010; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6011; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
6012; SKIP-CACHE-INV-NEXT:    s_endpgm
6013;
6014; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
6015; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6016; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6017; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6018; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6019; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6020; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
6021; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
6022; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6023; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
6024; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6025;
6026; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
6027; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6028; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
6029; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6030; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
6031; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6032; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
6033; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
6034; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6035; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
6036; GFX90A-TGSPLIT-NEXT:    s_endpgm
6037    i32 addrspace(3)* %out, i32 %in, i32 %old) {
6038entry:
  ; Index 4 in units of i32 is a 16-byte displacement from %out; it shows up
  ; as "offset:16" on the ds_cmpst_rtn_b32 in the assertions above.
6039  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old, write %in on match; both the success and the
  ; failure ordering are seq_cst.
6040  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
6041  %val0 = extractvalue { i32, i1 } %val, 0
6042  store i32 %val0, i32 addrspace(3)* %out, align 4
6043  ret void
6044}
6045
6046