1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9
; Kernel under test: atomic i32 load from addrspace(3) with
; syncscope("singlethread") and unordered ordering; the loaded value is then
; written to %out with an ordinary (non-atomic) store.
; For every tested configuration the autogenerated checks below expect a
; ds_read_b32, an s_waitcnt lgkmcnt(0) and a ds_write_b32 before s_endpgm;
; no other memory or cache-control instruction appears between them.
10define amdgpu_kernel void @local_singlethread_unordered_load(
11; GFX6-LABEL: local_singlethread_unordered_load:
12; GFX6:       ; %bb.0: ; %entry
13; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
14; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
15; GFX6-NEXT:    s_mov_b32 m0, -1
16; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX6-NEXT:    v_mov_b32_e32 v0, s0
18; GFX6-NEXT:    ds_read_b32 v0, v0
19; GFX6-NEXT:    v_mov_b32_e32 v1, s1
20; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX6-NEXT:    ds_write_b32 v1, v0
22; GFX6-NEXT:    s_endpgm
23;
24; GFX7-LABEL: local_singlethread_unordered_load:
25; GFX7:       ; %bb.0: ; %entry
26; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
27; GFX7-NEXT:    s_mov_b32 m0, -1
28; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX7-NEXT:    v_mov_b32_e32 v0, s0
30; GFX7-NEXT:    ds_read_b32 v0, v0
31; GFX7-NEXT:    v_mov_b32_e32 v1, s1
32; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX7-NEXT:    ds_write_b32 v1, v0
34; GFX7-NEXT:    s_endpgm
35;
36; GFX10-WGP-LABEL: local_singlethread_unordered_load:
37; GFX10-WGP:       ; %bb.0: ; %entry
38; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
39; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
43; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
45; GFX10-WGP-NEXT:    s_endpgm
46;
47; GFX10-CU-LABEL: local_singlethread_unordered_load:
48; GFX10-CU:       ; %bb.0: ; %entry
49; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
50; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
52; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
53; GFX10-CU-NEXT:    ds_read_b32 v0, v0
54; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX10-CU-NEXT:    ds_write_b32 v1, v0
56; GFX10-CU-NEXT:    s_endpgm
57;
58; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_load:
59; SKIP-CACHE-INV:       ; %bb.0: ; %entry
60; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
61; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
62; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
64; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
65; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
66; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
67; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
68; SKIP-CACHE-INV-NEXT:    s_endpgm
69;
70; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load:
71; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
72; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
73; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
75; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
76; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
77; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
79; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
80;
81; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load:
82; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
83; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
84; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
86; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
87; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
88; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
90; GFX90A-TGSPLIT-NEXT:    s_endpgm
91;
92;
93    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
94entry:
  ; Operation under test: singlethread-scope unordered atomic load.
95  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4
  ; Plain store that keeps the loaded value live.
96  store i32 %val, i32 addrspace(3)* %out
97  ret void
98}
99
; Kernel under test: atomic i32 load from addrspace(3) with
; syncscope("singlethread") and monotonic ordering; the loaded value is then
; written to %out with an ordinary (non-atomic) store.
; For every tested configuration the autogenerated checks below expect a
; ds_read_b32, an s_waitcnt lgkmcnt(0) and a ds_write_b32 before s_endpgm;
; no other memory or cache-control instruction appears between them.
100define amdgpu_kernel void @local_singlethread_monotonic_load(
101; GFX6-LABEL: local_singlethread_monotonic_load:
102; GFX6:       ; %bb.0: ; %entry
103; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
104; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
105; GFX6-NEXT:    s_mov_b32 m0, -1
106; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX6-NEXT:    v_mov_b32_e32 v0, s0
108; GFX6-NEXT:    ds_read_b32 v0, v0
109; GFX6-NEXT:    v_mov_b32_e32 v1, s1
110; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX6-NEXT:    ds_write_b32 v1, v0
112; GFX6-NEXT:    s_endpgm
113;
114; GFX7-LABEL: local_singlethread_monotonic_load:
115; GFX7:       ; %bb.0: ; %entry
116; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
117; GFX7-NEXT:    s_mov_b32 m0, -1
118; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
119; GFX7-NEXT:    v_mov_b32_e32 v0, s0
120; GFX7-NEXT:    ds_read_b32 v0, v0
121; GFX7-NEXT:    v_mov_b32_e32 v1, s1
122; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX7-NEXT:    ds_write_b32 v1, v0
124; GFX7-NEXT:    s_endpgm
125;
126; GFX10-WGP-LABEL: local_singlethread_monotonic_load:
127; GFX10-WGP:       ; %bb.0: ; %entry
128; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
129; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
131; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
132; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
133; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
135; GFX10-WGP-NEXT:    s_endpgm
136;
137; GFX10-CU-LABEL: local_singlethread_monotonic_load:
138; GFX10-CU:       ; %bb.0: ; %entry
139; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
140; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
142; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
143; GFX10-CU-NEXT:    ds_read_b32 v0, v0
144; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX10-CU-NEXT:    ds_write_b32 v1, v0
146; GFX10-CU-NEXT:    s_endpgm
147;
148; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_load:
149; SKIP-CACHE-INV:       ; %bb.0: ; %entry
150; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
151; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
152; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
153; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
154; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
155; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
156; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
157; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
158; SKIP-CACHE-INV-NEXT:    s_endpgm
159;
160; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load:
161; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
162; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
163; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
165; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
166; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
167; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
169; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
170;
171; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load:
172; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
173; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
174; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
176; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
177; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
178; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
180; GFX90A-TGSPLIT-NEXT:    s_endpgm
181;
182;
183    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
184entry:
  ; Operation under test: singlethread-scope monotonic atomic load.
185  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4
  ; Plain store that keeps the loaded value live.
186  store i32 %val, i32 addrspace(3)* %out
187  ret void
188}
189
; Kernel under test: atomic i32 load from addrspace(3) with
; syncscope("singlethread") and acquire ordering; the loaded value is then
; written to %out with an ordinary (non-atomic) store.
; For every tested configuration the autogenerated checks below expect a
; ds_read_b32, an s_waitcnt lgkmcnt(0) and a ds_write_b32 before s_endpgm;
; no cache-invalidate or additional wait instruction is expected after the load.
190define amdgpu_kernel void @local_singlethread_acquire_load(
191; GFX6-LABEL: local_singlethread_acquire_load:
192; GFX6:       ; %bb.0: ; %entry
193; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
194; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
195; GFX6-NEXT:    s_mov_b32 m0, -1
196; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
197; GFX6-NEXT:    v_mov_b32_e32 v0, s0
198; GFX6-NEXT:    ds_read_b32 v0, v0
199; GFX6-NEXT:    v_mov_b32_e32 v1, s1
200; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX6-NEXT:    ds_write_b32 v1, v0
202; GFX6-NEXT:    s_endpgm
203;
204; GFX7-LABEL: local_singlethread_acquire_load:
205; GFX7:       ; %bb.0: ; %entry
206; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
207; GFX7-NEXT:    s_mov_b32 m0, -1
208; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX7-NEXT:    v_mov_b32_e32 v0, s0
210; GFX7-NEXT:    ds_read_b32 v0, v0
211; GFX7-NEXT:    v_mov_b32_e32 v1, s1
212; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX7-NEXT:    ds_write_b32 v1, v0
214; GFX7-NEXT:    s_endpgm
215;
216; GFX10-WGP-LABEL: local_singlethread_acquire_load:
217; GFX10-WGP:       ; %bb.0: ; %entry
218; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
219; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
221; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
222; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
223; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
225; GFX10-WGP-NEXT:    s_endpgm
226;
227; GFX10-CU-LABEL: local_singlethread_acquire_load:
228; GFX10-CU:       ; %bb.0: ; %entry
229; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
233; GFX10-CU-NEXT:    ds_read_b32 v0, v0
234; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX10-CU-NEXT:    ds_write_b32 v1, v0
236; GFX10-CU-NEXT:    s_endpgm
237;
238; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_load:
239; SKIP-CACHE-INV:       ; %bb.0: ; %entry
240; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
241; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
242; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
243; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
244; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
246; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
247; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
248; SKIP-CACHE-INV-NEXT:    s_endpgm
249;
250; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load:
251; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
252; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
253; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
255; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
256; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
257; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
259; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
260;
261; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load:
262; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
263; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
264; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
266; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
267; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
268; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
270; GFX90A-TGSPLIT-NEXT:    s_endpgm
271;
272;
273    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
274entry:
  ; Operation under test: singlethread-scope acquire atomic load.
275  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4
  ; Plain store that keeps the loaded value live.
276  store i32 %val, i32 addrspace(3)* %out
277  ret void
278}
279
; Kernel under test: atomic i32 load from addrspace(3) with
; syncscope("singlethread") and seq_cst ordering; the loaded value is then
; written to %out with an ordinary (non-atomic) store.
; For every tested configuration the autogenerated checks below expect a
; ds_read_b32, an s_waitcnt lgkmcnt(0) and a ds_write_b32 before s_endpgm;
; no cache-invalidate or additional wait instruction is expected around the load.
280define amdgpu_kernel void @local_singlethread_seq_cst_load(
281; GFX6-LABEL: local_singlethread_seq_cst_load:
282; GFX6:       ; %bb.0: ; %entry
283; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
284; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
285; GFX6-NEXT:    s_mov_b32 m0, -1
286; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX6-NEXT:    v_mov_b32_e32 v0, s0
288; GFX6-NEXT:    ds_read_b32 v0, v0
289; GFX6-NEXT:    v_mov_b32_e32 v1, s1
290; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX6-NEXT:    ds_write_b32 v1, v0
292; GFX6-NEXT:    s_endpgm
293;
294; GFX7-LABEL: local_singlethread_seq_cst_load:
295; GFX7:       ; %bb.0: ; %entry
296; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
297; GFX7-NEXT:    s_mov_b32 m0, -1
298; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX7-NEXT:    v_mov_b32_e32 v0, s0
300; GFX7-NEXT:    ds_read_b32 v0, v0
301; GFX7-NEXT:    v_mov_b32_e32 v1, s1
302; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX7-NEXT:    ds_write_b32 v1, v0
304; GFX7-NEXT:    s_endpgm
305;
306; GFX10-WGP-LABEL: local_singlethread_seq_cst_load:
307; GFX10-WGP:       ; %bb.0: ; %entry
308; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
309; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
311; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
312; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
313; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
315; GFX10-WGP-NEXT:    s_endpgm
316;
317; GFX10-CU-LABEL: local_singlethread_seq_cst_load:
318; GFX10-CU:       ; %bb.0: ; %entry
319; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
320; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
322; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
323; GFX10-CU-NEXT:    ds_read_b32 v0, v0
324; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX10-CU-NEXT:    ds_write_b32 v1, v0
326; GFX10-CU-NEXT:    s_endpgm
327;
328; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_load:
329; SKIP-CACHE-INV:       ; %bb.0: ; %entry
330; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
331; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
332; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
333; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
334; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
335; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
336; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
337; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
338; SKIP-CACHE-INV-NEXT:    s_endpgm
339;
340; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load:
341; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
342; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
343; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
345; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
346; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
347; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
349; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
350;
351; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load:
352; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
354; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
356; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
358; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
360; GFX90A-TGSPLIT-NEXT:    s_endpgm
361;
362;
363    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
364entry:
  ; Operation under test: singlethread-scope seq_cst atomic load.
365  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4
  ; Plain store that keeps the loaded value live.
366  store i32 %val, i32 addrspace(3)* %out
367  ret void
368}
369
; Kernel under test: atomic i32 store of the kernel argument %in to the
; addrspace(3) pointer %out with syncscope("singlethread") and unordered
; ordering.
; For every tested configuration the autogenerated checks below expect a single
; ds_write_b32 directly followed by s_endpgm; no wait or cache-control
; instruction is expected around the store.
370define amdgpu_kernel void @local_singlethread_unordered_store(
371; GFX6-LABEL: local_singlethread_unordered_store:
372; GFX6:       ; %bb.0: ; %entry
373; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
374; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
375; GFX6-NEXT:    s_mov_b32 m0, -1
376; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX6-NEXT:    v_mov_b32_e32 v1, s0
378; GFX6-NEXT:    v_mov_b32_e32 v0, s1
379; GFX6-NEXT:    ds_write_b32 v0, v1
380; GFX6-NEXT:    s_endpgm
381;
382; GFX7-LABEL: local_singlethread_unordered_store:
383; GFX7:       ; %bb.0: ; %entry
384; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
385; GFX7-NEXT:    s_mov_b32 m0, -1
386; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX7-NEXT:    v_mov_b32_e32 v0, s1
388; GFX7-NEXT:    v_mov_b32_e32 v1, s0
389; GFX7-NEXT:    ds_write_b32 v0, v1
390; GFX7-NEXT:    s_endpgm
391;
392; GFX10-WGP-LABEL: local_singlethread_unordered_store:
393; GFX10-WGP:       ; %bb.0: ; %entry
394; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
395; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
397; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
398; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
399; GFX10-WGP-NEXT:    s_endpgm
400;
401; GFX10-CU-LABEL: local_singlethread_unordered_store:
402; GFX10-CU:       ; %bb.0: ; %entry
403; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
404; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
406; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
407; GFX10-CU-NEXT:    ds_write_b32 v0, v1
408; GFX10-CU-NEXT:    s_endpgm
409;
410; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_store:
411; SKIP-CACHE-INV:       ; %bb.0: ; %entry
412; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
413; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
414; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
415; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
417; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
418; SKIP-CACHE-INV-NEXT:    s_endpgm
419;
420; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store:
421; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
422; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
423; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
425; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
426; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
427; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
428;
429; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store:
430; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
431; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
432; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
434; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
435; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
436; GFX90A-TGSPLIT-NEXT:    s_endpgm
437;
438;
439    i32 %in, i32 addrspace(3)* %out) {
440entry:
  ; Operation under test: singlethread-scope unordered atomic store.
441  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4
442  ret void
443}
444
; Kernel under test: atomic i32 store of the kernel argument %in to the
; addrspace(3) pointer %out with syncscope("singlethread") and monotonic
; ordering.
; For every tested configuration the autogenerated checks below expect a single
; ds_write_b32 directly followed by s_endpgm; no wait or cache-control
; instruction is expected around the store.
445define amdgpu_kernel void @local_singlethread_monotonic_store(
446; GFX6-LABEL: local_singlethread_monotonic_store:
447; GFX6:       ; %bb.0: ; %entry
448; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
449; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
450; GFX6-NEXT:    s_mov_b32 m0, -1
451; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX6-NEXT:    v_mov_b32_e32 v1, s0
453; GFX6-NEXT:    v_mov_b32_e32 v0, s1
454; GFX6-NEXT:    ds_write_b32 v0, v1
455; GFX6-NEXT:    s_endpgm
456;
457; GFX7-LABEL: local_singlethread_monotonic_store:
458; GFX7:       ; %bb.0: ; %entry
459; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
460; GFX7-NEXT:    s_mov_b32 m0, -1
461; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX7-NEXT:    v_mov_b32_e32 v0, s1
463; GFX7-NEXT:    v_mov_b32_e32 v1, s0
464; GFX7-NEXT:    ds_write_b32 v0, v1
465; GFX7-NEXT:    s_endpgm
466;
467; GFX10-WGP-LABEL: local_singlethread_monotonic_store:
468; GFX10-WGP:       ; %bb.0: ; %entry
469; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
470; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
472; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
473; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
474; GFX10-WGP-NEXT:    s_endpgm
475;
476; GFX10-CU-LABEL: local_singlethread_monotonic_store:
477; GFX10-CU:       ; %bb.0: ; %entry
478; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
479; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
481; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
482; GFX10-CU-NEXT:    ds_write_b32 v0, v1
483; GFX10-CU-NEXT:    s_endpgm
484;
485; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_store:
486; SKIP-CACHE-INV:       ; %bb.0: ; %entry
487; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
488; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
489; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
492; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
493; SKIP-CACHE-INV-NEXT:    s_endpgm
494;
495; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store:
496; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
498; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
500; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
501; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
503;
504; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store:
505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
507; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
509; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
510; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
511; GFX90A-TGSPLIT-NEXT:    s_endpgm
512;
513;
514    i32 %in, i32 addrspace(3)* %out) {
515entry:
  ; Operation under test: singlethread-scope monotonic atomic store.
516  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4
517  ret void
518}
519
; Kernel under test: atomic i32 store of the kernel argument %in to the
; addrspace(3) pointer %out with syncscope("singlethread") and release
; ordering.
; For every tested configuration the autogenerated checks below expect a single
; ds_write_b32 directly followed by s_endpgm; the only wait before the store is
; the s_waitcnt lgkmcnt(0) that follows the kernel-argument load.
520define amdgpu_kernel void @local_singlethread_release_store(
521; GFX6-LABEL: local_singlethread_release_store:
522; GFX6:       ; %bb.0: ; %entry
523; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
524; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
525; GFX6-NEXT:    s_mov_b32 m0, -1
526; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
527; GFX6-NEXT:    v_mov_b32_e32 v1, s0
528; GFX6-NEXT:    v_mov_b32_e32 v0, s1
529; GFX6-NEXT:    ds_write_b32 v0, v1
530; GFX6-NEXT:    s_endpgm
531;
532; GFX7-LABEL: local_singlethread_release_store:
533; GFX7:       ; %bb.0: ; %entry
534; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
535; GFX7-NEXT:    s_mov_b32 m0, -1
536; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX7-NEXT:    v_mov_b32_e32 v0, s1
538; GFX7-NEXT:    v_mov_b32_e32 v1, s0
539; GFX7-NEXT:    ds_write_b32 v0, v1
540; GFX7-NEXT:    s_endpgm
541;
542; GFX10-WGP-LABEL: local_singlethread_release_store:
543; GFX10-WGP:       ; %bb.0: ; %entry
544; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
545; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
546; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
547; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
548; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
549; GFX10-WGP-NEXT:    s_endpgm
550;
551; GFX10-CU-LABEL: local_singlethread_release_store:
552; GFX10-CU:       ; %bb.0: ; %entry
553; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
554; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
556; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
557; GFX10-CU-NEXT:    ds_write_b32 v0, v1
558; GFX10-CU-NEXT:    s_endpgm
559;
560; SKIP-CACHE-INV-LABEL: local_singlethread_release_store:
561; SKIP-CACHE-INV:       ; %bb.0: ; %entry
562; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
563; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
564; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
566; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
567; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
568; SKIP-CACHE-INV-NEXT:    s_endpgm
569;
570; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store:
571; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
572; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
573; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
575; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
576; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
577; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
578;
579; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store:
580; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
581; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
582; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
585; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
586; GFX90A-TGSPLIT-NEXT:    s_endpgm
587;
588;
589    i32 %in, i32 addrspace(3)* %out) {
590entry:
  ; Operation under test: singlethread-scope release atomic store.
591  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4
592  ret void
593}
594
; Kernel under test: atomic i32 store of the kernel argument %in to the
; addrspace(3) pointer %out with syncscope("singlethread") and seq_cst
; ordering.
; For every tested configuration the autogenerated checks below expect a single
; ds_write_b32 directly followed by s_endpgm; the only wait before the store is
; the s_waitcnt lgkmcnt(0) that follows the kernel-argument load.
595define amdgpu_kernel void @local_singlethread_seq_cst_store(
596; GFX6-LABEL: local_singlethread_seq_cst_store:
597; GFX6:       ; %bb.0: ; %entry
598; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
599; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
600; GFX6-NEXT:    s_mov_b32 m0, -1
601; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX6-NEXT:    v_mov_b32_e32 v1, s0
603; GFX6-NEXT:    v_mov_b32_e32 v0, s1
604; GFX6-NEXT:    ds_write_b32 v0, v1
605; GFX6-NEXT:    s_endpgm
606;
607; GFX7-LABEL: local_singlethread_seq_cst_store:
608; GFX7:       ; %bb.0: ; %entry
609; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
610; GFX7-NEXT:    s_mov_b32 m0, -1
611; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX7-NEXT:    v_mov_b32_e32 v0, s1
613; GFX7-NEXT:    v_mov_b32_e32 v1, s0
614; GFX7-NEXT:    ds_write_b32 v0, v1
615; GFX7-NEXT:    s_endpgm
616;
617; GFX10-WGP-LABEL: local_singlethread_seq_cst_store:
618; GFX10-WGP:       ; %bb.0: ; %entry
619; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
620; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
622; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
623; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
624; GFX10-WGP-NEXT:    s_endpgm
625;
626; GFX10-CU-LABEL: local_singlethread_seq_cst_store:
627; GFX10-CU:       ; %bb.0: ; %entry
628; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
629; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
631; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
632; GFX10-CU-NEXT:    ds_write_b32 v0, v1
633; GFX10-CU-NEXT:    s_endpgm
634;
635; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_store:
636; SKIP-CACHE-INV:       ; %bb.0: ; %entry
637; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
638; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
639; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
640; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
641; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
642; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
643; SKIP-CACHE-INV-NEXT:    s_endpgm
644;
645; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store:
646; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
647; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
648; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
650; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
651; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
652; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
653;
654; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store:
655; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
656; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
657; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
658; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
659; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
660; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
661; GFX90A-TGSPLIT-NEXT:    s_endpgm
662;
663;
664    i32 %in, i32 addrspace(3)* %out) {
665entry:
  ; Operation under test: singlethread-scope seq_cst atomic store.
666  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4
667  ret void
668}
669
; Kernel under test: volatile atomicrmw xchg of %in into the addrspace(3)
; pointer %out with syncscope("singlethread") and monotonic ordering.
; The result %val is never used, yet every tested configuration in the
; autogenerated checks below expects the returning form ds_wrxchg_rtn_b32,
; directly followed by s_endpgm with no wait or cache-control instruction
; after the exchange.
670define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
671; GFX6-LABEL: local_singlethread_monotonic_atomicrmw:
672; GFX6:       ; %bb.0: ; %entry
673; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
674; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
675; GFX6-NEXT:    s_mov_b32 m0, -1
676; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX6-NEXT:    v_mov_b32_e32 v0, s0
678; GFX6-NEXT:    v_mov_b32_e32 v1, s1
679; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
680; GFX6-NEXT:    s_endpgm
681;
682; GFX7-LABEL: local_singlethread_monotonic_atomicrmw:
683; GFX7:       ; %bb.0: ; %entry
684; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
685; GFX7-NEXT:    s_mov_b32 m0, -1
686; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX7-NEXT:    v_mov_b32_e32 v0, s0
688; GFX7-NEXT:    v_mov_b32_e32 v1, s1
689; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
690; GFX7-NEXT:    s_endpgm
691;
692; GFX10-WGP-LABEL: local_singlethread_monotonic_atomicrmw:
693; GFX10-WGP:       ; %bb.0: ; %entry
694; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
695; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
697; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
698; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
699; GFX10-WGP-NEXT:    s_endpgm
700;
701; GFX10-CU-LABEL: local_singlethread_monotonic_atomicrmw:
702; GFX10-CU:       ; %bb.0: ; %entry
703; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
704; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
706; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
707; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
708; GFX10-CU-NEXT:    s_endpgm
709;
710; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_atomicrmw:
711; SKIP-CACHE-INV:       ; %bb.0: ; %entry
712; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
713; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
714; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
715; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
716; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
717; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
718; SKIP-CACHE-INV-NEXT:    s_endpgm
719;
720; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
721; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
722; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
723; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
725; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
726; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
727; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
728;
729; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
730; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
731; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
732; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
734; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
735; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
736; GFX90A-TGSPLIT-NEXT:    s_endpgm
737;
738;
739    i32 addrspace(3)* %out, i32 %in) {
740entry:
  ; Operation under test: singlethread-scope monotonic exchange; %val is
  ; intentionally left unused.
741  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic
742  ret void
743}
744
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with acquire ordering. The value
; returned by the exchange has no users.
; For every check prefix the expected code is only the scalar loads that feed
; the two operands, one s_waitcnt lgkmcnt(0), the ds_wrxchg_rtn_b32 and
; s_endpgm; nothing is expected after the exchange.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
745define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
746; GFX6-LABEL: local_singlethread_acquire_atomicrmw:
747; GFX6:       ; %bb.0: ; %entry
748; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
749; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
750; GFX6-NEXT:    s_mov_b32 m0, -1
751; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
752; GFX6-NEXT:    v_mov_b32_e32 v0, s0
753; GFX6-NEXT:    v_mov_b32_e32 v1, s1
754; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
755; GFX6-NEXT:    s_endpgm
756;
757; GFX7-LABEL: local_singlethread_acquire_atomicrmw:
758; GFX7:       ; %bb.0: ; %entry
759; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
760; GFX7-NEXT:    s_mov_b32 m0, -1
761; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX7-NEXT:    v_mov_b32_e32 v0, s0
763; GFX7-NEXT:    v_mov_b32_e32 v1, s1
764; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
765; GFX7-NEXT:    s_endpgm
766;
767; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw:
768; GFX10-WGP:       ; %bb.0: ; %entry
769; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
770; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
772; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
773; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
774; GFX10-WGP-NEXT:    s_endpgm
775;
776; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw:
777; GFX10-CU:       ; %bb.0: ; %entry
778; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
779; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
781; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
782; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
783; GFX10-CU-NEXT:    s_endpgm
784;
785; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_atomicrmw:
786; SKIP-CACHE-INV:       ; %bb.0: ; %entry
787; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
788; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
789; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
790; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
791; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
792; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
793; SKIP-CACHE-INV-NEXT:    s_endpgm
794;
795; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
796; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
797; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
798; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
800; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
801; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
802; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
803;
804; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
805; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
806; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
807; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
808; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
809; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
810; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
811; GFX90A-TGSPLIT-NEXT:    s_endpgm
812;
813;
814    i32 addrspace(3)* %out, i32 %in) {
815entry:
  ; Exchange %in into *%out with acquire ordering; %val is never read.
816  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
817  ret void
818}
819
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with release ordering. The value
; returned by the exchange has no users.
; For every check prefix the expected code is only the scalar loads that feed
; the two operands, one s_waitcnt lgkmcnt(0), the ds_wrxchg_rtn_b32 and
; s_endpgm; no extra instruction is expected directly before the exchange.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
820define amdgpu_kernel void @local_singlethread_release_atomicrmw(
821; GFX6-LABEL: local_singlethread_release_atomicrmw:
822; GFX6:       ; %bb.0: ; %entry
823; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
824; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
825; GFX6-NEXT:    s_mov_b32 m0, -1
826; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX6-NEXT:    v_mov_b32_e32 v0, s0
828; GFX6-NEXT:    v_mov_b32_e32 v1, s1
829; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
830; GFX6-NEXT:    s_endpgm
831;
832; GFX7-LABEL: local_singlethread_release_atomicrmw:
833; GFX7:       ; %bb.0: ; %entry
834; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
835; GFX7-NEXT:    s_mov_b32 m0, -1
836; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
837; GFX7-NEXT:    v_mov_b32_e32 v0, s0
838; GFX7-NEXT:    v_mov_b32_e32 v1, s1
839; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
840; GFX7-NEXT:    s_endpgm
841;
842; GFX10-WGP-LABEL: local_singlethread_release_atomicrmw:
843; GFX10-WGP:       ; %bb.0: ; %entry
844; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
845; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
847; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
848; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
849; GFX10-WGP-NEXT:    s_endpgm
850;
851; GFX10-CU-LABEL: local_singlethread_release_atomicrmw:
852; GFX10-CU:       ; %bb.0: ; %entry
853; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
854; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
856; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
857; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
858; GFX10-CU-NEXT:    s_endpgm
859;
860; SKIP-CACHE-INV-LABEL: local_singlethread_release_atomicrmw:
861; SKIP-CACHE-INV:       ; %bb.0: ; %entry
862; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
863; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
864; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
865; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
866; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
867; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
868; SKIP-CACHE-INV-NEXT:    s_endpgm
869;
870; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw:
871; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
872; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
873; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
875; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
876; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
877; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
878;
879; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw:
880; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
881; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
882; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
884; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
885; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
886; GFX90A-TGSPLIT-NEXT:    s_endpgm
887;
888;
889    i32 addrspace(3)* %out, i32 %in) {
890entry:
  ; Exchange %in into *%out with release ordering; %val is never read.
891  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release
892  ret void
893}
894
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with acq_rel ordering. The value
; returned by the exchange has no users.
; For every check prefix the expected code is only the scalar loads that feed
; the two operands, one s_waitcnt lgkmcnt(0), the ds_wrxchg_rtn_b32 and
; s_endpgm; nothing extra is expected on either side of the exchange.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
895define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
896; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw:
897; GFX6:       ; %bb.0: ; %entry
898; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
899; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
900; GFX6-NEXT:    s_mov_b32 m0, -1
901; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX6-NEXT:    v_mov_b32_e32 v0, s0
903; GFX6-NEXT:    v_mov_b32_e32 v1, s1
904; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
905; GFX6-NEXT:    s_endpgm
906;
907; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw:
908; GFX7:       ; %bb.0: ; %entry
909; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
910; GFX7-NEXT:    s_mov_b32 m0, -1
911; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX7-NEXT:    v_mov_b32_e32 v0, s0
913; GFX7-NEXT:    v_mov_b32_e32 v1, s1
914; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
915; GFX7-NEXT:    s_endpgm
916;
917; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw:
918; GFX10-WGP:       ; %bb.0: ; %entry
919; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
920; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
922; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
923; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
924; GFX10-WGP-NEXT:    s_endpgm
925;
926; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw:
927; GFX10-CU:       ; %bb.0: ; %entry
928; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
929; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
931; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
932; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
933; GFX10-CU-NEXT:    s_endpgm
934;
935; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_atomicrmw:
936; SKIP-CACHE-INV:       ; %bb.0: ; %entry
937; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
938; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
939; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
941; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
942; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
943; SKIP-CACHE-INV-NEXT:    s_endpgm
944;
945; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
946; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
947; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
948; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
949; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
950; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
951; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
952; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
953;
954; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
955; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
956; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
957; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
959; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
960; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
961; GFX90A-TGSPLIT-NEXT:    s_endpgm
962;
963;
964    i32 addrspace(3)* %out, i32 %in) {
965entry:
  ; Exchange %in into *%out with acq_rel ordering; %val is never read.
966  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
967  ret void
968}
969
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with seq_cst ordering. The value
; returned by the exchange has no users.
; For every check prefix the expected code is only the scalar loads that feed
; the two operands, one s_waitcnt lgkmcnt(0), the ds_wrxchg_rtn_b32 and
; s_endpgm; nothing extra is expected on either side of the exchange.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
970define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
971; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw:
972; GFX6:       ; %bb.0: ; %entry
973; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
974; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
975; GFX6-NEXT:    s_mov_b32 m0, -1
976; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX6-NEXT:    v_mov_b32_e32 v0, s0
978; GFX6-NEXT:    v_mov_b32_e32 v1, s1
979; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
980; GFX6-NEXT:    s_endpgm
981;
982; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw:
983; GFX7:       ; %bb.0: ; %entry
984; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
985; GFX7-NEXT:    s_mov_b32 m0, -1
986; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX7-NEXT:    v_mov_b32_e32 v0, s0
988; GFX7-NEXT:    v_mov_b32_e32 v1, s1
989; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
990; GFX7-NEXT:    s_endpgm
991;
992; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw:
993; GFX10-WGP:       ; %bb.0: ; %entry
994; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
995; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
996; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
997; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
998; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
999; GFX10-WGP-NEXT:    s_endpgm
1000;
1001; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw:
1002; GFX10-CU:       ; %bb.0: ; %entry
1003; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1004; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1005; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1006; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1007; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1008; GFX10-CU-NEXT:    s_endpgm
1009;
1010; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_atomicrmw:
1011; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1012; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1013; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1014; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1015; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1016; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1017; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1018; SKIP-CACHE-INV-NEXT:    s_endpgm
1019;
1020; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
1021; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1022; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1023; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1025; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1026; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1027; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1028;
1029; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
1030; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1031; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1032; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1033; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1034; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1035; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
1036; GFX90A-TGSPLIT-NEXT:    s_endpgm
1037;
1038;
1039    i32 addrspace(3)* %out, i32 %in) {
1040entry:
  ; Exchange %in into *%out with seq_cst ordering; %val is never read.
1041  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
1042  ret void
1043}
1044
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with acquire ordering, whose returned
; value IS used: it is stored back through the same pointer.
; For every check prefix the expected code has ds_wrxchg_rtn_b32 write v1,
; then an s_waitcnt lgkmcnt(0), then ds_write_b32 storing v1 to the address
; held in v0. No other instruction is expected between the exchange and the
; store.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1045define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
1046; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw:
1047; GFX6:       ; %bb.0: ; %entry
1048; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1049; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1050; GFX6-NEXT:    s_mov_b32 m0, -1
1051; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1053; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1054; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1055; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1056; GFX6-NEXT:    ds_write_b32 v0, v1
1057; GFX6-NEXT:    s_endpgm
1058;
1059; GFX7-LABEL: local_singlethread_acquire_ret_atomicrmw:
1060; GFX7:       ; %bb.0: ; %entry
1061; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1062; GFX7-NEXT:    s_mov_b32 m0, -1
1063; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1065; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1066; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1068; GFX7-NEXT:    ds_write_b32 v0, v1
1069; GFX7-NEXT:    s_endpgm
1070;
1071; GFX10-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw:
1072; GFX10-WGP:       ; %bb.0: ; %entry
1073; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1074; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1076; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1077; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1078; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1079; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1080; GFX10-WGP-NEXT:    s_endpgm
1081;
1082; GFX10-CU-LABEL: local_singlethread_acquire_ret_atomicrmw:
1083; GFX10-CU:       ; %bb.0: ; %entry
1084; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1085; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1087; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1088; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1089; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1091; GFX10-CU-NEXT:    s_endpgm
1092;
1093; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_ret_atomicrmw:
1094; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1095; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1096; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1097; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1099; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1100; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1101; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1102; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1103; SKIP-CACHE-INV-NEXT:    s_endpgm
1104;
1105; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
1106; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1107; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1110; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1111; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1114; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1115;
1116; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
1117; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1119; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1120; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1121; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1122; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1123; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1125; GFX90A-TGSPLIT-NEXT:    s_endpgm
1126;
1127;
1128    i32 addrspace(3)* %out, i32 %in) {
1129entry:
  ; Exchange %in into *%out with acquire ordering, then store the value the
  ; exchange returned back to *%out so that the result has a user.
1130  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
1131  store i32 %val, i32 addrspace(3)* %out, align 4
1132  ret void
1133}
1134
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with acq_rel ordering, whose returned
; value IS used: it is stored back through the same pointer.
; For every check prefix the expected code has ds_wrxchg_rtn_b32 write v1,
; then an s_waitcnt lgkmcnt(0), then ds_write_b32 storing v1 to the address
; held in v0. No other instruction is expected between the exchange and the
; store.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1135define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
1136; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1137; GFX6:       ; %bb.0: ; %entry
1138; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1139; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1140; GFX6-NEXT:    s_mov_b32 m0, -1
1141; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1142; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1143; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1144; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1145; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1146; GFX6-NEXT:    ds_write_b32 v0, v1
1147; GFX6-NEXT:    s_endpgm
1148;
1149; GFX7-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1150; GFX7:       ; %bb.0: ; %entry
1151; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1152; GFX7-NEXT:    s_mov_b32 m0, -1
1153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1155; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1156; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1157; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX7-NEXT:    ds_write_b32 v0, v1
1159; GFX7-NEXT:    s_endpgm
1160;
1161; GFX10-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1162; GFX10-WGP:       ; %bb.0: ; %entry
1163; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1164; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1166; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1167; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1168; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1170; GFX10-WGP-NEXT:    s_endpgm
1171;
1172; GFX10-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1173; GFX10-CU:       ; %bb.0: ; %entry
1174; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1175; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1176; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1177; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1178; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1179; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1181; GFX10-CU-NEXT:    s_endpgm
1182;
1183; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1184; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1185; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1186; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1187; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1190; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1191; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1192; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1193; SKIP-CACHE-INV-NEXT:    s_endpgm
1194;
1195; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1196; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1198; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1200; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1201; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1204; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1205;
1206; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
1207; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1208; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1209; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1211; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1212; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1213; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1215; GFX90A-TGSPLIT-NEXT:    s_endpgm
1216;
1217;
1218    i32 addrspace(3)* %out, i32 %in) {
1219entry:
  ; Exchange %in into *%out with acq_rel ordering, then store the value the
  ; exchange returned back to *%out so that the result has a user.
1220  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
1221  store i32 %val, i32 addrspace(3)* %out, align 4
1222  ret void
1223}
1224
; Kernel under test: a volatile `atomicrmw xchg` through an addrspace(3)
; pointer at syncscope("singlethread") with seq_cst ordering, whose returned
; value IS used: it is stored back through the same pointer.
; For every check prefix the expected code has ds_wrxchg_rtn_b32 write v1,
; then an s_waitcnt lgkmcnt(0), then ds_write_b32 storing v1 to the address
; held in v0. No other instruction is expected between the exchange and the
; store.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1225define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
1226; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1227; GFX6:       ; %bb.0: ; %entry
1228; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
1229; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
1230; GFX6-NEXT:    s_mov_b32 m0, -1
1231; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1232; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1233; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1234; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1235; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX6-NEXT:    ds_write_b32 v0, v1
1237; GFX6-NEXT:    s_endpgm
1238;
1239; GFX7-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1240; GFX7:       ; %bb.0: ; %entry
1241; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1242; GFX7-NEXT:    s_mov_b32 m0, -1
1243; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1245; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1246; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1247; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1248; GFX7-NEXT:    ds_write_b32 v0, v1
1249; GFX7-NEXT:    s_endpgm
1250;
1251; GFX10-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1252; GFX10-WGP:       ; %bb.0: ; %entry
1253; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1254; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1255; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1256; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1257; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1258; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
1260; GFX10-WGP-NEXT:    s_endpgm
1261;
1262; GFX10-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1263; GFX10-CU:       ; %bb.0: ; %entry
1264; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1265; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1267; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1268; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1269; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX10-CU-NEXT:    ds_write_b32 v0, v1
1271; GFX10-CU-NEXT:    s_endpgm
1272;
1273; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1274; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1275; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1276; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1277; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1278; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1279; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1280; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1281; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1282; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
1283; SKIP-CACHE-INV-NEXT:    s_endpgm
1284;
1285; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1286; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1288; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1290; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1291; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1292; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
1294; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1295;
1296; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
1297; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1298; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1299; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1301; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
1302; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
1303; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
1305; GFX90A-TGSPLIT-NEXT:    s_endpgm
1306;
1307;
1308    i32 addrspace(3)* %out, i32 %in) {
1309entry:
  ; Exchange %in into *%out with seq_cst ordering, then store the value the
  ; exchange returned back to *%out so that the result has a user.
1310  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
1311  store i32 %val, i32 addrspace(3)* %out, align 4
1312  ret void
1313}
1314
; Kernel under test: a volatile `cmpxchg` at syncscope("singlethread") with
; monotonic success ordering and monotonic failure ordering. The target is the
; i32 element four slots past %out (an addrspace(3) pointer); %old is the
; compare value, %in the new value, and the cmpxchg result has no users.
; The 4 x i32 GEP appears as `offset:16` on the expected ds_cmpst_b32. For
; every check prefix the expected code is only the scalar loads, one
; s_waitcnt lgkmcnt(0), three v_mov_b32 copies, the ds_cmpst_b32 and s_endpgm.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1315define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
1316; GFX6-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1317; GFX6:       ; %bb.0: ; %entry
1318; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1319; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1320; GFX6-NEXT:    s_mov_b32 m0, -1
1321; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1323; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1324; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1325; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1326; GFX6-NEXT:    s_endpgm
1327;
1328; GFX7-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1329; GFX7:       ; %bb.0: ; %entry
1330; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1331; GFX7-NEXT:    s_mov_b32 m0, -1
1332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1334; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1335; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1336; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1337; GFX7-NEXT:    s_endpgm
1338;
1339; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1340; GFX10-WGP:       ; %bb.0: ; %entry
1341; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1342; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1344; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1345; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1346; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1347; GFX10-WGP-NEXT:    s_endpgm
1348;
1349; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1350; GFX10-CU:       ; %bb.0: ; %entry
1351; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1352; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1354; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1355; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1356; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1357; GFX10-CU-NEXT:    s_endpgm
1358;
1359; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1360; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1361; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1362; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1363; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1364; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1365; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1366; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1367; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1368; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1369; SKIP-CACHE-INV-NEXT:    s_endpgm
1370;
1371; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1372; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1373; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1374; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1376; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1377; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1378; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1379; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1380;
1381; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
1382; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1383; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1384; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1386; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1387; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1388; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1389; GFX90A-TGSPLIT-NEXT:    s_endpgm
1390;
1391;
1392    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1393entry:
  ; Address the fourth i32 after %out (16 bytes), then compare-and-swap there:
  ; expected value %old, replacement %in. %val is never read.
1394  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1395  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
1396  ret void
1397}
1398
; Kernel under test: a volatile `cmpxchg` at syncscope("singlethread") with
; acquire success ordering and monotonic failure ordering. The target is the
; i32 element four slots past %out (an addrspace(3) pointer); %old is the
; compare value, %in the new value, and the cmpxchg result has no users.
; The 4 x i32 GEP appears as `offset:16` on the expected ds_cmpst_b32. For
; every check prefix the expected code is only the scalar loads, one
; s_waitcnt lgkmcnt(0), three v_mov_b32 copies, the ds_cmpst_b32 and s_endpgm;
; nothing is expected after the ds_cmpst_b32.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1399define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
1400; GFX6-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1401; GFX6:       ; %bb.0: ; %entry
1402; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1403; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1404; GFX6-NEXT:    s_mov_b32 m0, -1
1405; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1406; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1407; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1408; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1409; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1410; GFX6-NEXT:    s_endpgm
1411;
1412; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1413; GFX7:       ; %bb.0: ; %entry
1414; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1415; GFX7-NEXT:    s_mov_b32 m0, -1
1416; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1418; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1419; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1420; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1421; GFX7-NEXT:    s_endpgm
1422;
1423; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1424; GFX10-WGP:       ; %bb.0: ; %entry
1425; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1426; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1428; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1429; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1430; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1431; GFX10-WGP-NEXT:    s_endpgm
1432;
1433; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1434; GFX10-CU:       ; %bb.0: ; %entry
1435; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1436; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1438; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1439; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1440; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1441; GFX10-CU-NEXT:    s_endpgm
1442;
1443; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1444; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1445; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1446; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1447; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1448; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1450; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1451; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1452; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1453; SKIP-CACHE-INV-NEXT:    s_endpgm
1454;
1455; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1456; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1457; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1458; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1460; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1461; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1462; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1463; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1464;
1465; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
1466; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1467; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1468; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1469; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1470; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1471; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1472; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1473; GFX90A-TGSPLIT-NEXT:    s_endpgm
1474;
1475;
1476    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1477entry:
  ; Address the fourth i32 after %out (16 bytes), then compare-and-swap there:
  ; expected value %old, replacement %in. %val is never read.
1478  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
1479  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
1480  ret void
1481}
1482
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering release and failure ordering
; monotonic. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1483define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
1484; GFX6-LABEL: local_singlethread_release_monotonic_cmpxchg:
1485; GFX6:       ; %bb.0: ; %entry
1486; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1487; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1488; GFX6-NEXT:    s_mov_b32 m0, -1
1489; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1491; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1492; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1493; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1494; GFX6-NEXT:    s_endpgm
1495;
1496; GFX7-LABEL: local_singlethread_release_monotonic_cmpxchg:
1497; GFX7:       ; %bb.0: ; %entry
1498; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1499; GFX7-NEXT:    s_mov_b32 m0, -1
1500; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1501; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1502; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1503; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1504; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1505; GFX7-NEXT:    s_endpgm
1506;
1507; GFX10-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
1508; GFX10-WGP:       ; %bb.0: ; %entry
1509; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1510; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1512; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1513; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1514; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1515; GFX10-WGP-NEXT:    s_endpgm
1516;
1517; GFX10-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
1518; GFX10-CU:       ; %bb.0: ; %entry
1519; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1520; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1521; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1522; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1523; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1524; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1525; GFX10-CU-NEXT:    s_endpgm
1526;
1527; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_cmpxchg:
1528; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1529; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1530; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1531; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1532; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1536; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1537; SKIP-CACHE-INV-NEXT:    s_endpgm
1538;
1539; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
1540; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1541; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1542; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1544; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1545; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1546; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1547; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1548;
1549; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
1550; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1551; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1552; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1554; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1555; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1556; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1557; GFX90A-TGSPLIT-NEXT:    s_endpgm
1558;
1559;
1560    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1561entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1562  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1563  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
1564  ret void
1565}
1566
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering acq_rel and failure ordering
; monotonic. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1567define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
1568; GFX6-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1569; GFX6:       ; %bb.0: ; %entry
1570; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1571; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1572; GFX6-NEXT:    s_mov_b32 m0, -1
1573; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1575; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1576; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1577; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1578; GFX6-NEXT:    s_endpgm
1579;
1580; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1581; GFX7:       ; %bb.0: ; %entry
1582; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1583; GFX7-NEXT:    s_mov_b32 m0, -1
1584; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1586; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1587; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1588; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1589; GFX7-NEXT:    s_endpgm
1590;
1591; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1592; GFX10-WGP:       ; %bb.0: ; %entry
1593; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1594; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1596; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1597; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1598; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1599; GFX10-WGP-NEXT:    s_endpgm
1600;
1601; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1602; GFX10-CU:       ; %bb.0: ; %entry
1603; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1604; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1605; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1606; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1607; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1608; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1609; GFX10-CU-NEXT:    s_endpgm
1610;
1611; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1612; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1613; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1614; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1615; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1616; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1619; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1620; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1621; SKIP-CACHE-INV-NEXT:    s_endpgm
1622;
1623; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1624; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1625; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1626; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1627; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1628; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1629; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1630; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1631; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1632;
1633; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
1634; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1635; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1636; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1637; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1638; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1639; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1640; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1641; GFX90A-TGSPLIT-NEXT:    s_endpgm
1642;
1643;
1644    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1645entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1646  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1647  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
1648  ret void
1649}
1650
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering seq_cst and failure ordering
; monotonic. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1651define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
1652; GFX6-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1653; GFX6:       ; %bb.0: ; %entry
1654; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1655; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1656; GFX6-NEXT:    s_mov_b32 m0, -1
1657; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1658; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1659; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1660; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1661; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1662; GFX6-NEXT:    s_endpgm
1663;
1664; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1665; GFX7:       ; %bb.0: ; %entry
1666; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1667; GFX7-NEXT:    s_mov_b32 m0, -1
1668; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1670; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1671; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1672; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1673; GFX7-NEXT:    s_endpgm
1674;
1675; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1676; GFX10-WGP:       ; %bb.0: ; %entry
1677; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1678; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1680; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1681; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1682; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1683; GFX10-WGP-NEXT:    s_endpgm
1684;
1685; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1686; GFX10-CU:       ; %bb.0: ; %entry
1687; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1688; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1690; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1691; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1692; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1693; GFX10-CU-NEXT:    s_endpgm
1694;
1695; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1696; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1697; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1698; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1699; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1700; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1701; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1702; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1703; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1704; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1705; SKIP-CACHE-INV-NEXT:    s_endpgm
1706;
1707; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1708; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1709; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1710; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1712; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1713; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1714; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1715; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1716;
1717; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
1718; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1719; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1720; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1721; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1722; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1723; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1724; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1725; GFX90A-TGSPLIT-NEXT:    s_endpgm
1726;
1727;
1728    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1729entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1730  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1731  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
1732  ret void
1733}
1734
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering acquire and failure ordering
; acquire. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1735define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
1736; GFX6-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1737; GFX6:       ; %bb.0: ; %entry
1738; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1739; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1740; GFX6-NEXT:    s_mov_b32 m0, -1
1741; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1742; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1743; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1744; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1745; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1746; GFX6-NEXT:    s_endpgm
1747;
1748; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1749; GFX7:       ; %bb.0: ; %entry
1750; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1751; GFX7-NEXT:    s_mov_b32 m0, -1
1752; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1754; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1755; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1756; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1757; GFX7-NEXT:    s_endpgm
1758;
1759; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1760; GFX10-WGP:       ; %bb.0: ; %entry
1761; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1762; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1763; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1764; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1765; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1766; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1767; GFX10-WGP-NEXT:    s_endpgm
1768;
1769; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1770; GFX10-CU:       ; %bb.0: ; %entry
1771; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1772; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1774; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1775; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1776; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1777; GFX10-CU-NEXT:    s_endpgm
1778;
1779; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1780; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1781; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1782; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1783; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1784; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1786; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1787; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1788; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1789; SKIP-CACHE-INV-NEXT:    s_endpgm
1790;
1791; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1792; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1793; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1794; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1796; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1797; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1798; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1799; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1800;
1801; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
1802; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1803; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1804; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1806; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1807; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1808; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1809; GFX90A-TGSPLIT-NEXT:    s_endpgm
1810;
1811;
1812    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1813entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1814  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1815  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
1816  ret void
1817}
1818
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering release and failure ordering
; acquire. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1819define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
1820; GFX6-LABEL: local_singlethread_release_acquire_cmpxchg:
1821; GFX6:       ; %bb.0: ; %entry
1822; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1823; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1824; GFX6-NEXT:    s_mov_b32 m0, -1
1825; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1826; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1827; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1828; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1829; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1830; GFX6-NEXT:    s_endpgm
1831;
1832; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg:
1833; GFX7:       ; %bb.0: ; %entry
1834; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1835; GFX7-NEXT:    s_mov_b32 m0, -1
1836; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1837; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1838; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1839; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1840; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1841; GFX7-NEXT:    s_endpgm
1842;
1843; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
1844; GFX10-WGP:       ; %bb.0: ; %entry
1845; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1846; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1847; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1848; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1849; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1850; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1851; GFX10-WGP-NEXT:    s_endpgm
1852;
1853; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
1854; GFX10-CU:       ; %bb.0: ; %entry
1855; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1856; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1857; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1858; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1859; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1860; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1861; GFX10-CU-NEXT:    s_endpgm
1862;
1863; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_cmpxchg:
1864; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1865; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1866; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1867; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1868; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1870; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1871; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1872; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1873; SKIP-CACHE-INV-NEXT:    s_endpgm
1874;
1875; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
1876; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1877; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1878; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1879; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1880; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1881; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1882; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1883; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1884;
1885; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
1886; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1887; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1888; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1890; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1891; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1892; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1893; GFX90A-TGSPLIT-NEXT:    s_endpgm
1894;
1895;
1896    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1897entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1898  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1899  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
1900  ret void
1901}
1902
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering acq_rel and failure ordering
; acquire. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1903define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
1904; GFX6-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1905; GFX6:       ; %bb.0: ; %entry
1906; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1907; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1908; GFX6-NEXT:    s_mov_b32 m0, -1
1909; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1911; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1912; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1913; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1914; GFX6-NEXT:    s_endpgm
1915;
1916; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1917; GFX7:       ; %bb.0: ; %entry
1918; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1919; GFX7-NEXT:    s_mov_b32 m0, -1
1920; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1922; GFX7-NEXT:    v_mov_b32_e32 v1, s2
1923; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1924; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1925; GFX7-NEXT:    s_endpgm
1926;
1927; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1928; GFX10-WGP:       ; %bb.0: ; %entry
1929; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1930; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1931; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1932; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1933; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
1934; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1935; GFX10-WGP-NEXT:    s_endpgm
1936;
1937; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1938; GFX10-CU:       ; %bb.0: ; %entry
1939; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1940; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1941; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1942; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1943; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
1944; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1945; GFX10-CU-NEXT:    s_endpgm
1946;
1947; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1948; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1949; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1950; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1951; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
1952; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1953; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1954; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
1955; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
1956; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1957; SKIP-CACHE-INV-NEXT:    s_endpgm
1958;
1959; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1960; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1961; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1962; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1963; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1964; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1965; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1966; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1967; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1968;
1969; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
1970; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1971; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1972; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
1974; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1975; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
1976; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1977; GFX90A-TGSPLIT-NEXT:    s_endpgm
1978;
1979;
1980    i32 addrspace(3)* %out, i32 %in, i32 %old) {
1981entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
1982  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
1983  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
1984  ret void
1985}
1986
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering seq_cst and failure ordering
; acquire. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
1987define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
1988; GFX6-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
1989; GFX6:       ; %bb.0: ; %entry
1990; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
1991; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
1992; GFX6-NEXT:    s_mov_b32 m0, -1
1993; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1995; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1996; GFX6-NEXT:    v_mov_b32_e32 v2, s0
1997; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
1998; GFX6-NEXT:    s_endpgm
1999;
2000; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2001; GFX7:       ; %bb.0: ; %entry
2002; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2003; GFX7-NEXT:    s_mov_b32 m0, -1
2004; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2005; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2006; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2007; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2008; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2009; GFX7-NEXT:    s_endpgm
2010;
2011; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2012; GFX10-WGP:       ; %bb.0: ; %entry
2013; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2014; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2016; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2017; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2018; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2019; GFX10-WGP-NEXT:    s_endpgm
2020;
2021; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2022; GFX10-CU:       ; %bb.0: ; %entry
2023; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2024; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2026; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2027; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2028; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2029; GFX10-CU-NEXT:    s_endpgm
2030;
2031; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2032; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2033; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2034; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2035; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2036; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2037; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2038; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2039; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2040; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2041; SKIP-CACHE-INV-NEXT:    s_endpgm
2042;
2043; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2044; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2045; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2046; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2047; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2048; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2049; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2050; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2051; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2052;
2053; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
2054; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2055; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2056; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2057; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2058; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2059; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2060; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2061; GFX90A-TGSPLIT-NEXT:    s_endpgm
2062;
2063;
2064    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2065entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
2066  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
2067  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
2068  ret void
2069}
2070
; Test: volatile cmpxchg on an addrspace(3) (AMDGPU local/LDS) pointer with
; syncscope("singlethread"), success ordering seq_cst and failure ordering
; seq_cst. The cmpxchg result %val is not used.
; For every llc configuration the check lines below expect one ds_cmpst_b32
; at offset:16 followed directly by s_endpgm; the only s_waitcnt shown is the
; one after the kernel-argument loads.
; NOTE(review): presumably no extra synchronization is needed because the
; scope is singlethread - confirm against the AMDGPU memory model.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2071define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
2072; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2073; GFX6:       ; %bb.0: ; %entry
2074; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2075; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2076; GFX6-NEXT:    s_mov_b32 m0, -1
2077; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2078; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2079; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2080; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2081; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2082; GFX6-NEXT:    s_endpgm
2083;
2084; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2085; GFX7:       ; %bb.0: ; %entry
2086; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2087; GFX7-NEXT:    s_mov_b32 m0, -1
2088; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2089; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2090; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2091; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2092; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2093; GFX7-NEXT:    s_endpgm
2094;
2095; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2096; GFX10-WGP:       ; %bb.0: ; %entry
2097; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2098; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2099; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2100; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2101; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2102; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2103; GFX10-WGP-NEXT:    s_endpgm
2104;
2105; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2106; GFX10-CU:       ; %bb.0: ; %entry
2107; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2108; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2109; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2110; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2111; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2112; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2113; GFX10-CU-NEXT:    s_endpgm
2114;
2115; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2116; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2117; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2118; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2119; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2120; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2122; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2123; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2124; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2125; SKIP-CACHE-INV-NEXT:    s_endpgm
2126;
2127; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2128; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2129; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2130; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2131; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2132; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2133; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2134; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2135; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2136;
2137; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
2138; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2139; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2140; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2141; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2142; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2143; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2144; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
2145; GFX90A-TGSPLIT-NEXT:    s_endpgm
2146;
2147;
2148    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2149entry:
  ; i32 element index 4 is byte offset 16, which appears as offset:16 in the
  ; expected ds_cmpst_b32 above.
2150  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match.
2151  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
2152  ret void
2153}
2154
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; acquire and failure ordering monotonic, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2254
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; acq_rel and failure ordering monotonic, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2354
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; seq_cst and failure ordering monotonic, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2454
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; acquire and failure ordering acquire, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2554
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; release and failure ordering acquire, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2654
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; acq_rel and failure ordering acquire, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2754
; cmpxchg on LDS (addrspace(3)) at "singlethread" scope with success ordering
; seq_cst and failure ordering acquire, where the loaded value is used.
; The kernel exchanges at %out + 4 elements (the "offset:16" in the checks
; below) and stores the value returned by the cmpxchg back to %out.
; Every check prefix expects one ds_cmpst_rtn_b32 followed only by the
; s_waitcnt lgkmcnt(0) that precedes the dependent ds_write_b32.
define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Operate on the fifth i32 after %out so the access carries a non-zero offset.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
2854
; Test kernel for a volatile cmpxchg on an addrspace(3) pointer with
; syncscope("singlethread") and seq_cst success / seq_cst failure orderings,
; whose loaded value is stored back through %out.
; Each expected sequence below consists of the kernel-argument loads, one
; ds_cmpst_rtn_b32 at offset 16, an s_waitcnt lgkmcnt(0) for its result and the
; ds_write_b32 of that result. The gfx600 and gfx700 runs additionally expect
; the m0 initialisation (s_mov_b32 m0, -1).
2855define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
2856; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2857; GFX6:       ; %bb.0: ; %entry
2858; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
2859; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
2860; GFX6-NEXT:    s_mov_b32 m0, -1
2861; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2862; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2863; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2864; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2865; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2866; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2867; GFX6-NEXT:    ds_write_b32 v0, v1
2868; GFX6-NEXT:    s_endpgm
2869;
2870; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2871; GFX7:       ; %bb.0: ; %entry
2872; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2873; GFX7-NEXT:    s_mov_b32 m0, -1
2874; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2875; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2876; GFX7-NEXT:    v_mov_b32_e32 v1, s2
2877; GFX7-NEXT:    v_mov_b32_e32 v2, s1
2878; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2879; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2880; GFX7-NEXT:    ds_write_b32 v0, v1
2881; GFX7-NEXT:    s_endpgm
2882;
2883; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2884; GFX10-WGP:       ; %bb.0: ; %entry
2885; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2886; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2887; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2888; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2889; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
2890; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2891; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2892; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
2893; GFX10-WGP-NEXT:    s_endpgm
2894;
2895; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2896; GFX10-CU:       ; %bb.0: ; %entry
2897; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2898; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2900; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2901; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
2902; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2903; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2904; GFX10-CU-NEXT:    ds_write_b32 v0, v1
2905; GFX10-CU-NEXT:    s_endpgm
2906;
2907; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2908; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2909; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2910; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
2911; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
2912; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2913; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
2915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2916; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2917; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2918; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
2919; SKIP-CACHE-INV-NEXT:    s_endpgm
2920;
2921; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2922; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2923; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2924; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2925; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2926; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2927; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2928; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2929; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2930; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
2931; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2932;
2933; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
2934; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2935; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2936; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2937; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
2938; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2939; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
2940; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
2941; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
2943; GFX90A-TGSPLIT-NEXT:    s_endpgm
2944;
2945;
2946    i32 addrspace(3)* %out, i32 %in, i32 %old) {
2947entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past it, which is the
  ; offset 16 seen on the expected ds_cmpst_rtn_b32.
2948  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare the word at %gep with %old and, if equal, replace it with %in.
2949  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value that was in memory; the i1
  ; success flag in field 1 is not used by this test.
2950  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the returned value keeps the "rtn" form of the instruction live.
2951  store i32 %val0, i32 addrspace(3)* %out, align 4
2952  ret void
2953}
2954
; Test kernel for an atomic i32 load from an addrspace(3) pointer with
; syncscope("singlethread-one-as") and unordered ordering. The loaded value is
; then written to %out with a plain, non-atomic store.
; Each expected sequence below is the kernel-argument load(s), a ds_read_b32,
; an s_waitcnt lgkmcnt(0) for its result and a ds_write_b32. The gfx600 and
; gfx700 runs additionally expect the m0 initialisation (s_mov_b32 m0, -1).
2955define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
2956; GFX6-LABEL: local_singlethread_one_as_unordered_load:
2957; GFX6:       ; %bb.0: ; %entry
2958; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
2959; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
2960; GFX6-NEXT:    s_mov_b32 m0, -1
2961; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2962; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2963; GFX6-NEXT:    ds_read_b32 v0, v0
2964; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2965; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2966; GFX6-NEXT:    ds_write_b32 v1, v0
2967; GFX6-NEXT:    s_endpgm
2968;
2969; GFX7-LABEL: local_singlethread_one_as_unordered_load:
2970; GFX7:       ; %bb.0: ; %entry
2971; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2972; GFX7-NEXT:    s_mov_b32 m0, -1
2973; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2974; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2975; GFX7-NEXT:    ds_read_b32 v0, v0
2976; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2977; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2978; GFX7-NEXT:    ds_write_b32 v1, v0
2979; GFX7-NEXT:    s_endpgm
2980;
2981; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load:
2982; GFX10-WGP:       ; %bb.0: ; %entry
2983; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2984; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2985; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2986; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2987; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
2988; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2989; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
2990; GFX10-WGP-NEXT:    s_endpgm
2991;
2992; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load:
2993; GFX10-CU:       ; %bb.0: ; %entry
2994; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2995; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2996; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2997; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2998; GFX10-CU-NEXT:    ds_read_b32 v0, v0
2999; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3000; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3001; GFX10-CU-NEXT:    s_endpgm
3002;
3003; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load:
3004; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3005; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3006; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3007; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3009; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3010; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3011; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3012; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3013; SKIP-CACHE-INV-NEXT:    s_endpgm
3014;
3015; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
3016; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3017; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3018; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3019; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3020; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3021; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3022; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3023; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3024; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3025;
3026; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
3027; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3028; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3029; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3030; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3031; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3032; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3033; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3035; GFX90A-TGSPLIT-NEXT:    s_endpgm
3036;
3037;
3038    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3039entry:
  ; The atomic load under test.
3040  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4
  ; Plain store that consumes the loaded value so the load has a user.
3041  store i32 %val, i32 addrspace(3)* %out
3042  ret void
3043}
3044
; Test kernel for an atomic i32 load from an addrspace(3) pointer with
; syncscope("singlethread-one-as") and monotonic ordering. The loaded value is
; then written to %out with a plain, non-atomic store.
; Each expected sequence below is the kernel-argument load(s), a ds_read_b32,
; an s_waitcnt lgkmcnt(0) for its result and a ds_write_b32. The gfx600 and
; gfx700 runs additionally expect the m0 initialisation (s_mov_b32 m0, -1).
3045define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
3046; GFX6-LABEL: local_singlethread_one_as_monotonic_load:
3047; GFX6:       ; %bb.0: ; %entry
3048; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3049; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3050; GFX6-NEXT:    s_mov_b32 m0, -1
3051; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3052; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3053; GFX6-NEXT:    ds_read_b32 v0, v0
3054; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3055; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3056; GFX6-NEXT:    ds_write_b32 v1, v0
3057; GFX6-NEXT:    s_endpgm
3058;
3059; GFX7-LABEL: local_singlethread_one_as_monotonic_load:
3060; GFX7:       ; %bb.0: ; %entry
3061; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3062; GFX7-NEXT:    s_mov_b32 m0, -1
3063; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3064; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3065; GFX7-NEXT:    ds_read_b32 v0, v0
3066; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3068; GFX7-NEXT:    ds_write_b32 v1, v0
3069; GFX7-NEXT:    s_endpgm
3070;
3071; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load:
3072; GFX10-WGP:       ; %bb.0: ; %entry
3073; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3074; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3075; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3076; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3077; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3078; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3079; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3080; GFX10-WGP-NEXT:    s_endpgm
3081;
3082; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load:
3083; GFX10-CU:       ; %bb.0: ; %entry
3084; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3085; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3086; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3087; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3088; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3089; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3090; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3091; GFX10-CU-NEXT:    s_endpgm
3092;
3093; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load:
3094; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3095; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3096; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3097; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3099; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3100; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3101; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3102; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3103; SKIP-CACHE-INV-NEXT:    s_endpgm
3104;
3105; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
3106; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3107; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3109; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3110; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3111; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3113; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3114; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3115;
3116; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
3117; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3119; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3120; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3121; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3122; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3123; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3124; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3125; GFX90A-TGSPLIT-NEXT:    s_endpgm
3126;
3127;
3128    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3129entry:
  ; The atomic load under test.
3130  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4
  ; Plain store that consumes the loaded value so the load has a user.
3131  store i32 %val, i32 addrspace(3)* %out
3132  ret void
3133}
3134
; Test kernel for an atomic i32 load from an addrspace(3) pointer with
; syncscope("singlethread-one-as") and acquire ordering. The loaded value is
; then written to %out with a plain, non-atomic store.
; Each expected sequence below is the kernel-argument load(s), a ds_read_b32,
; an s_waitcnt lgkmcnt(0) for its result and a ds_write_b32, with nothing else
; emitted for the acquire ordering. The gfx600 and gfx700 runs additionally
; expect the m0 initialisation (s_mov_b32 m0, -1).
3135define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
3136; GFX6-LABEL: local_singlethread_one_as_acquire_load:
3137; GFX6:       ; %bb.0: ; %entry
3138; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3139; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3140; GFX6-NEXT:    s_mov_b32 m0, -1
3141; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3143; GFX6-NEXT:    ds_read_b32 v0, v0
3144; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3145; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3146; GFX6-NEXT:    ds_write_b32 v1, v0
3147; GFX6-NEXT:    s_endpgm
3148;
3149; GFX7-LABEL: local_singlethread_one_as_acquire_load:
3150; GFX7:       ; %bb.0: ; %entry
3151; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3152; GFX7-NEXT:    s_mov_b32 m0, -1
3153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3155; GFX7-NEXT:    ds_read_b32 v0, v0
3156; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3157; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3158; GFX7-NEXT:    ds_write_b32 v1, v0
3159; GFX7-NEXT:    s_endpgm
3160;
3161; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load:
3162; GFX10-WGP:       ; %bb.0: ; %entry
3163; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3164; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3165; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3166; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3167; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3168; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3169; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3170; GFX10-WGP-NEXT:    s_endpgm
3171;
3172; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load:
3173; GFX10-CU:       ; %bb.0: ; %entry
3174; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3175; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3176; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3177; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3178; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3179; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3180; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3181; GFX10-CU-NEXT:    s_endpgm
3182;
3183; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load:
3184; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3185; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3186; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3187; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3189; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3191; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3192; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3193; SKIP-CACHE-INV-NEXT:    s_endpgm
3194;
3195; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
3196; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3198; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3199; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3200; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3201; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3203; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3204; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3205;
3206; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
3207; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3208; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3209; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3211; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3212; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3213; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3214; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3215; GFX90A-TGSPLIT-NEXT:    s_endpgm
3216;
3217;
3218    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3219entry:
  ; The atomic load under test.
3220  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4
  ; Plain store that consumes the loaded value so the load has a user.
3221  store i32 %val, i32 addrspace(3)* %out
3222  ret void
3223}
3224
; Test kernel for an atomic i32 load from an addrspace(3) pointer with
; syncscope("singlethread-one-as") and seq_cst ordering. The loaded value is
; then written to %out with a plain, non-atomic store.
; Each expected sequence below is the kernel-argument load(s), a ds_read_b32,
; an s_waitcnt lgkmcnt(0) for its result and a ds_write_b32, with nothing else
; emitted for the seq_cst ordering. The gfx600 and gfx700 runs additionally
; expect the m0 initialisation (s_mov_b32 m0, -1).
3225define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
3226; GFX6-LABEL: local_singlethread_one_as_seq_cst_load:
3227; GFX6:       ; %bb.0: ; %entry
3228; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3229; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3230; GFX6-NEXT:    s_mov_b32 m0, -1
3231; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3232; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3233; GFX6-NEXT:    ds_read_b32 v0, v0
3234; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3235; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3236; GFX6-NEXT:    ds_write_b32 v1, v0
3237; GFX6-NEXT:    s_endpgm
3238;
3239; GFX7-LABEL: local_singlethread_one_as_seq_cst_load:
3240; GFX7:       ; %bb.0: ; %entry
3241; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3242; GFX7-NEXT:    s_mov_b32 m0, -1
3243; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3245; GFX7-NEXT:    ds_read_b32 v0, v0
3246; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3247; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3248; GFX7-NEXT:    ds_write_b32 v1, v0
3249; GFX7-NEXT:    s_endpgm
3250;
3251; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load:
3252; GFX10-WGP:       ; %bb.0: ; %entry
3253; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3254; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3255; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3256; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3257; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
3258; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
3260; GFX10-WGP-NEXT:    s_endpgm
3261;
3262; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load:
3263; GFX10-CU:       ; %bb.0: ; %entry
3264; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3265; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3266; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3267; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3268; GFX10-CU-NEXT:    ds_read_b32 v0, v0
3269; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3270; GFX10-CU-NEXT:    ds_write_b32 v1, v0
3271; GFX10-CU-NEXT:    s_endpgm
3272;
3273; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load:
3274; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3275; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3276; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3277; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3278; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3279; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
3280; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3281; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3282; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
3283; SKIP-CACHE-INV-NEXT:    s_endpgm
3284;
3285; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
3286; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3288; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3289; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3290; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
3291; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3292; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
3294; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3295;
3296; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
3297; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3298; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3299; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3300; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3301; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
3302; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3303; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3304; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
3305; GFX90A-TGSPLIT-NEXT:    s_endpgm
3306;
3307;
3308    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
3309entry:
  ; The atomic load under test.
3310  %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4
  ; Plain store that consumes the loaded value so the load has a user.
3311  store i32 %val, i32 addrspace(3)* %out
3312  ret void
3313}
3314
; Test kernel for an atomic i32 store of %in to an addrspace(3) pointer with
; syncscope("singlethread-one-as") and unordered ordering.
; Each expected sequence below is the kernel-argument load(s), an
; s_waitcnt lgkmcnt(0), the moves of the address and data into VGPRs and a
; single ds_write_b32 followed directly by s_endpgm. The gfx600 and gfx700 runs
; additionally expect the m0 initialisation (s_mov_b32 m0, -1).
3315define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
3316; GFX6-LABEL: local_singlethread_one_as_unordered_store:
3317; GFX6:       ; %bb.0: ; %entry
3318; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3319; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3320; GFX6-NEXT:    s_mov_b32 m0, -1
3321; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3322; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3323; GFX6-NEXT:    v_mov_b32_e32 v0, s1
3324; GFX6-NEXT:    ds_write_b32 v0, v1
3325; GFX6-NEXT:    s_endpgm
3326;
3327; GFX7-LABEL: local_singlethread_one_as_unordered_store:
3328; GFX7:       ; %bb.0: ; %entry
3329; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3330; GFX7-NEXT:    s_mov_b32 m0, -1
3331; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3332; GFX7-NEXT:    v_mov_b32_e32 v0, s1
3333; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3334; GFX7-NEXT:    ds_write_b32 v0, v1
3335; GFX7-NEXT:    s_endpgm
3336;
3337; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store:
3338; GFX10-WGP:       ; %bb.0: ; %entry
3339; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3340; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3341; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3342; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3343; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3344; GFX10-WGP-NEXT:    s_endpgm
3345;
3346; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store:
3347; GFX10-CU:       ; %bb.0: ; %entry
3348; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3349; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3350; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3351; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3352; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3353; GFX10-CU-NEXT:    s_endpgm
3354;
3355; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store:
3356; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3357; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3358; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3359; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3361; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3362; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3363; SKIP-CACHE-INV-NEXT:    s_endpgm
3364;
3365; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
3366; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3367; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3368; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3369; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3370; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3371; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3372; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3373;
3374; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
3375; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3376; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3377; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3378; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3379; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3380; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3381; GFX90A-TGSPLIT-NEXT:    s_endpgm
3382;
3383;
3384    i32 %in, i32 addrspace(3)* %out) {
3385entry:
  ; The atomic store under test; it is the only memory operation in the kernel.
3386  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4
3387  ret void
3388}
3389
; Test kernel for an atomic i32 store of %in to an addrspace(3) pointer with
; syncscope("singlethread-one-as") and monotonic ordering.
; Each expected sequence below is the kernel-argument load(s), an
; s_waitcnt lgkmcnt(0), the moves of the address and data into VGPRs and a
; single ds_write_b32 followed directly by s_endpgm. The gfx600 and gfx700 runs
; additionally expect the m0 initialisation (s_mov_b32 m0, -1).
3390define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
3391; GFX6-LABEL: local_singlethread_one_as_monotonic_store:
3392; GFX6:       ; %bb.0: ; %entry
3393; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3394; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3395; GFX6-NEXT:    s_mov_b32 m0, -1
3396; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3397; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3398; GFX6-NEXT:    v_mov_b32_e32 v0, s1
3399; GFX6-NEXT:    ds_write_b32 v0, v1
3400; GFX6-NEXT:    s_endpgm
3401;
3402; GFX7-LABEL: local_singlethread_one_as_monotonic_store:
3403; GFX7:       ; %bb.0: ; %entry
3404; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3405; GFX7-NEXT:    s_mov_b32 m0, -1
3406; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3407; GFX7-NEXT:    v_mov_b32_e32 v0, s1
3408; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3409; GFX7-NEXT:    ds_write_b32 v0, v1
3410; GFX7-NEXT:    s_endpgm
3411;
3412; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store:
3413; GFX10-WGP:       ; %bb.0: ; %entry
3414; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3415; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3417; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3418; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3419; GFX10-WGP-NEXT:    s_endpgm
3420;
3421; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store:
3422; GFX10-CU:       ; %bb.0: ; %entry
3423; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3424; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3425; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3426; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3427; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3428; GFX10-CU-NEXT:    s_endpgm
3429;
3430; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store:
3431; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3432; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3433; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3434; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3437; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3438; SKIP-CACHE-INV-NEXT:    s_endpgm
3439;
3440; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
3441; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3442; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3443; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3444; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3445; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3446; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3447; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3448;
3449; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
3450; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3451; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3452; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3453; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3454; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3455; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3456; GFX90A-TGSPLIT-NEXT:    s_endpgm
3457;
3458;
3459    i32 %in, i32 addrspace(3)* %out) {
3460entry:
  ; The atomic store under test; it is the only memory operation in the kernel.
3461  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4
3462  ret void
3463}
3464
; Test kernel for an atomic i32 store of %in to an addrspace(3) pointer with
; syncscope("singlethread-one-as") and release ordering.
; Each expected sequence below is the kernel-argument load(s), an
; s_waitcnt lgkmcnt(0), the moves of the address and data into VGPRs and a
; single ds_write_b32 followed directly by s_endpgm, with nothing else emitted
; for the release ordering. The gfx600 and gfx700 runs additionally expect the
; m0 initialisation (s_mov_b32 m0, -1).
3465define amdgpu_kernel void @local_singlethread_one_as_release_store(
3466; GFX6-LABEL: local_singlethread_one_as_release_store:
3467; GFX6:       ; %bb.0: ; %entry
3468; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3469; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3470; GFX6-NEXT:    s_mov_b32 m0, -1
3471; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3472; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3473; GFX6-NEXT:    v_mov_b32_e32 v0, s1
3474; GFX6-NEXT:    ds_write_b32 v0, v1
3475; GFX6-NEXT:    s_endpgm
3476;
3477; GFX7-LABEL: local_singlethread_one_as_release_store:
3478; GFX7:       ; %bb.0: ; %entry
3479; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3480; GFX7-NEXT:    s_mov_b32 m0, -1
3481; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3482; GFX7-NEXT:    v_mov_b32_e32 v0, s1
3483; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3484; GFX7-NEXT:    ds_write_b32 v0, v1
3485; GFX7-NEXT:    s_endpgm
3486;
3487; GFX10-WGP-LABEL: local_singlethread_one_as_release_store:
3488; GFX10-WGP:       ; %bb.0: ; %entry
3489; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3490; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3491; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3492; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3493; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3494; GFX10-WGP-NEXT:    s_endpgm
3495;
3496; GFX10-CU-LABEL: local_singlethread_one_as_release_store:
3497; GFX10-CU:       ; %bb.0: ; %entry
3498; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3499; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3500; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3501; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3502; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3503; GFX10-CU-NEXT:    s_endpgm
3504;
3505; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store:
3506; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3507; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3508; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3509; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3510; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3511; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3512; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3513; SKIP-CACHE-INV-NEXT:    s_endpgm
3514;
3515; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store:
3516; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3517; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3518; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3520; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3521; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3522; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3523;
3524; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store:
3525; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3526; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3527; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3529; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3530; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3531; GFX90A-TGSPLIT-NEXT:    s_endpgm
3532;
3533;
3534    i32 %in, i32 addrspace(3)* %out) {
3535entry:
  ; The atomic store under test; it is the only memory operation in the kernel.
3536  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4
3537  ret void
3538}
3539
; Test kernel for an atomic i32 store of %in to an addrspace(3) pointer with
; syncscope("singlethread-one-as") and seq_cst ordering.
; Each expected sequence below is the kernel-argument load(s), an
; s_waitcnt lgkmcnt(0), the moves of the address and data into VGPRs and a
; single ds_write_b32 followed directly by s_endpgm, with nothing else emitted
; for the seq_cst ordering. The gfx600 and gfx700 runs additionally expect the
; m0 initialisation (s_mov_b32 m0, -1).
3540define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
3541; GFX6-LABEL: local_singlethread_one_as_seq_cst_store:
3542; GFX6:       ; %bb.0: ; %entry
3543; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
3544; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
3545; GFX6-NEXT:    s_mov_b32 m0, -1
3546; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3547; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3548; GFX6-NEXT:    v_mov_b32_e32 v0, s1
3549; GFX6-NEXT:    ds_write_b32 v0, v1
3550; GFX6-NEXT:    s_endpgm
3551;
3552; GFX7-LABEL: local_singlethread_one_as_seq_cst_store:
3553; GFX7:       ; %bb.0: ; %entry
3554; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3555; GFX7-NEXT:    s_mov_b32 m0, -1
3556; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3557; GFX7-NEXT:    v_mov_b32_e32 v0, s1
3558; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3559; GFX7-NEXT:    ds_write_b32 v0, v1
3560; GFX7-NEXT:    s_endpgm
3561;
3562; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store:
3563; GFX10-WGP:       ; %bb.0: ; %entry
3564; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3566; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
3567; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
3568; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
3569; GFX10-WGP-NEXT:    s_endpgm
3570;
3571; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store:
3572; GFX10-CU:       ; %bb.0: ; %entry
3573; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3574; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3575; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
3576; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
3577; GFX10-CU-NEXT:    ds_write_b32 v0, v1
3578; GFX10-CU-NEXT:    s_endpgm
3579;
3580; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store:
3581; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3582; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3583; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
3584; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3585; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
3586; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
3587; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
3588; SKIP-CACHE-INV-NEXT:    s_endpgm
3589;
3590; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
3591; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3592; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3593; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3594; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3595; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3596; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
3597; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3598;
3599; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
3600; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3601; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3602; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3603; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
3604; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
3605; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
3606; GFX90A-TGSPLIT-NEXT:    s_endpgm
3607;
3608;
3609    i32 %in, i32 addrspace(3)* %out) {
3610entry:
  ; The atomic store under test; it is the only memory operation in the kernel.
3611  store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4
3612  ret void
3613}
3614
define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile exchange on an addrspace(3) pointer at "singlethread-one-as"
  ; scope with monotonic ordering. The result is unused; every checked target
  ; expects a bare ds_wrxchg_rtn_b32 followed directly by s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic
  ret void
}
3689
define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile exchange on an addrspace(3) pointer at "singlethread-one-as"
  ; scope with acquire ordering. The result is unused; every checked target
  ; expects a bare ds_wrxchg_rtn_b32 followed directly by s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
  ret void
}
3764
define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile exchange on an addrspace(3) pointer at "singlethread-one-as"
  ; scope with release ordering. The result is unused; every checked target
  ; expects a bare ds_wrxchg_rtn_b32 followed directly by s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release
  ret void
}
3839
define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile exchange on an addrspace(3) pointer at "singlethread-one-as"
  ; scope with acq_rel ordering. The result is unused; every checked target
  ; expects a bare ds_wrxchg_rtn_b32 followed directly by s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
  ret void
}
3914
define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile exchange on an addrspace(3) pointer at "singlethread-one-as"
  ; scope with seq_cst ordering. The result is unused; every checked target
  ; expects a bare ds_wrxchg_rtn_b32 followed directly by s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
  ret void
}
3989
define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile acquire exchange at "singlethread-one-as" scope whose returned
  ; value is stored back to %out. Every checked target expects an
  ; s_waitcnt lgkmcnt(0) between the ds_wrxchg_rtn_b32 and the ds_write_b32
  ; that consumes its result, and nothing else before s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4079
define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile acq_rel exchange at "singlethread-one-as" scope whose returned
  ; value is stored back to %out. Every checked target expects an
  ; s_waitcnt lgkmcnt(0) between the ds_wrxchg_rtn_b32 and the ds_write_b32
  ; that consumes its result, and nothing else before s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4169
define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in) {
entry:
  ; Volatile seq_cst exchange at "singlethread-one-as" scope whose returned
  ; value is stored back to %out. Every checked target expects an
  ; s_waitcnt lgkmcnt(0) between the ds_wrxchg_rtn_b32 and the ds_write_b32
  ; that consumes its result, and nothing else before s_endpgm.
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}
4259
define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; %gep is %out advanced by four i32 elements; the checks expect that
  ; displacement to appear on the instruction itself as offset:16.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Volatile compare-and-swap at "singlethread-one-as" scope, monotonic on
  ; both success and failure. The result is unused; every checked target
  ; expects a bare ds_cmpst_b32 followed directly by s_endpgm.
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
  ret void
}
4343
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering acquire, failure ordering monotonic.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4344define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
4345; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4346; GFX6:       ; %bb.0: ; %entry
4347; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4348; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4349; GFX6-NEXT:    s_mov_b32 m0, -1
4350; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4351; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4352; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4353; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4354; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4355; GFX6-NEXT:    s_endpgm
4356;
4357; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4358; GFX7:       ; %bb.0: ; %entry
4359; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4360; GFX7-NEXT:    s_mov_b32 m0, -1
4361; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4362; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4363; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4364; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4365; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4366; GFX7-NEXT:    s_endpgm
4367;
4368; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4369; GFX10-WGP:       ; %bb.0: ; %entry
4370; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4371; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4372; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4373; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4374; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4375; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4376; GFX10-WGP-NEXT:    s_endpgm
4377;
4378; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4379; GFX10-CU:       ; %bb.0: ; %entry
4380; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4381; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4382; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4383; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4384; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4385; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4386; GFX10-CU-NEXT:    s_endpgm
4387;
4388; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4389; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4390; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4391; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4392; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4393; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4394; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4395; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4396; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4397; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4398; SKIP-CACHE-INV-NEXT:    s_endpgm
4399;
4400; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4401; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4402; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4403; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4404; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4405; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4406; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4407; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4408; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4409;
4410; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
4411; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4412; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4413; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4414; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4415; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4416; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4417; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4418; GFX90A-TGSPLIT-NEXT:    s_endpgm
4419;
4420;
4421    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4422entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4423  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4424  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
4425  ret void
4426}
4427
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering release, failure ordering monotonic.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4428define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
4429; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4430; GFX6:       ; %bb.0: ; %entry
4431; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4432; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4433; GFX6-NEXT:    s_mov_b32 m0, -1
4434; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4435; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4436; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4437; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4438; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4439; GFX6-NEXT:    s_endpgm
4440;
4441; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4442; GFX7:       ; %bb.0: ; %entry
4443; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4444; GFX7-NEXT:    s_mov_b32 m0, -1
4445; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4446; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4447; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4448; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4449; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4450; GFX7-NEXT:    s_endpgm
4451;
4452; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4453; GFX10-WGP:       ; %bb.0: ; %entry
4454; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4455; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4456; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4457; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4458; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4459; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4460; GFX10-WGP-NEXT:    s_endpgm
4461;
4462; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4463; GFX10-CU:       ; %bb.0: ; %entry
4464; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4465; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4466; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4467; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4468; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4469; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4470; GFX10-CU-NEXT:    s_endpgm
4471;
4472; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4473; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4474; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4475; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4476; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4477; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4478; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4479; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4480; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4481; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4482; SKIP-CACHE-INV-NEXT:    s_endpgm
4483;
4484; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4485; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4486; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4487; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4488; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4489; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4490; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4491; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4492; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4493;
4494; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
4495; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4496; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4497; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4498; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4499; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4500; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4501; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4502; GFX90A-TGSPLIT-NEXT:    s_endpgm
4503;
4504;
4505    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4506entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4507  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4508  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
4509  ret void
4510}
4511
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering acq_rel, failure ordering monotonic.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4512define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
4513; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4514; GFX6:       ; %bb.0: ; %entry
4515; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4516; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4517; GFX6-NEXT:    s_mov_b32 m0, -1
4518; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4519; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4520; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4521; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4522; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4523; GFX6-NEXT:    s_endpgm
4524;
4525; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4526; GFX7:       ; %bb.0: ; %entry
4527; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4528; GFX7-NEXT:    s_mov_b32 m0, -1
4529; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4530; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4531; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4532; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4533; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4534; GFX7-NEXT:    s_endpgm
4535;
4536; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4537; GFX10-WGP:       ; %bb.0: ; %entry
4538; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4539; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4540; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4541; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4542; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4543; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4544; GFX10-WGP-NEXT:    s_endpgm
4545;
4546; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4547; GFX10-CU:       ; %bb.0: ; %entry
4548; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4549; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4550; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4551; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4552; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4553; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4554; GFX10-CU-NEXT:    s_endpgm
4555;
4556; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4557; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4558; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4559; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4560; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4561; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4562; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4563; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4564; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4565; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4566; SKIP-CACHE-INV-NEXT:    s_endpgm
4567;
4568; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4569; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4570; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4571; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4572; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4573; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4574; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4575; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4576; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4577;
4578; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
4579; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4580; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4581; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4582; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4583; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4585; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4586; GFX90A-TGSPLIT-NEXT:    s_endpgm
4587;
4588;
4589    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4590entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4591  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4592  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
4593  ret void
4594}
4595
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering seq_cst, failure ordering monotonic.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4596define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
4597; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4598; GFX6:       ; %bb.0: ; %entry
4599; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4600; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4601; GFX6-NEXT:    s_mov_b32 m0, -1
4602; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4603; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4604; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4605; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4606; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4607; GFX6-NEXT:    s_endpgm
4608;
4609; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4610; GFX7:       ; %bb.0: ; %entry
4611; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4612; GFX7-NEXT:    s_mov_b32 m0, -1
4613; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4614; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4615; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4616; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4617; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4618; GFX7-NEXT:    s_endpgm
4619;
4620; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4621; GFX10-WGP:       ; %bb.0: ; %entry
4622; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4623; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4624; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4625; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4626; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4627; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4628; GFX10-WGP-NEXT:    s_endpgm
4629;
4630; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4631; GFX10-CU:       ; %bb.0: ; %entry
4632; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4633; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4634; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4635; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4636; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4637; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4638; GFX10-CU-NEXT:    s_endpgm
4639;
4640; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4641; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4642; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4643; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4644; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4645; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4649; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4650; SKIP-CACHE-INV-NEXT:    s_endpgm
4651;
4652; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4653; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4655; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4656; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4657; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4658; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4659; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4660; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4661;
4662; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
4663; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4664; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4665; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4666; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4667; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4668; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4669; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4670; GFX90A-TGSPLIT-NEXT:    s_endpgm
4671;
4672;
4673    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4674entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4675  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4676  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
4677  ret void
4678}
4679
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering acquire, failure ordering acquire.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4680define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
4681; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4682; GFX6:       ; %bb.0: ; %entry
4683; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4684; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4685; GFX6-NEXT:    s_mov_b32 m0, -1
4686; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4687; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4688; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4689; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4690; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4691; GFX6-NEXT:    s_endpgm
4692;
4693; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4694; GFX7:       ; %bb.0: ; %entry
4695; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4696; GFX7-NEXT:    s_mov_b32 m0, -1
4697; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4698; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4699; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4700; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4701; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4702; GFX7-NEXT:    s_endpgm
4703;
4704; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4705; GFX10-WGP:       ; %bb.0: ; %entry
4706; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4707; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4708; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4709; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4710; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4711; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4712; GFX10-WGP-NEXT:    s_endpgm
4713;
4714; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4715; GFX10-CU:       ; %bb.0: ; %entry
4716; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4717; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4718; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4719; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4720; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4721; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4722; GFX10-CU-NEXT:    s_endpgm
4723;
4724; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4725; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4726; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4727; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4728; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4729; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4730; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4731; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4732; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4733; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4734; SKIP-CACHE-INV-NEXT:    s_endpgm
4735;
4736; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4737; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4738; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4739; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4740; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4741; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4742; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4743; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4744; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4745;
4746; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
4747; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4748; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4749; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4751; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4752; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4753; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4754; GFX90A-TGSPLIT-NEXT:    s_endpgm
4755;
4756;
4757    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4758entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4759  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4760  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
4761  ret void
4762}
4763
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering release, failure ordering acquire.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4764define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
4765; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4766; GFX6:       ; %bb.0: ; %entry
4767; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4768; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4769; GFX6-NEXT:    s_mov_b32 m0, -1
4770; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4771; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4772; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4773; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4774; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4775; GFX6-NEXT:    s_endpgm
4776;
4777; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4778; GFX7:       ; %bb.0: ; %entry
4779; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4780; GFX7-NEXT:    s_mov_b32 m0, -1
4781; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4782; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4783; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4784; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4785; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4786; GFX7-NEXT:    s_endpgm
4787;
4788; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4789; GFX10-WGP:       ; %bb.0: ; %entry
4790; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4791; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4792; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4793; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4794; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4795; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4796; GFX10-WGP-NEXT:    s_endpgm
4797;
4798; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4799; GFX10-CU:       ; %bb.0: ; %entry
4800; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4801; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4802; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4803; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4804; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4805; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4806; GFX10-CU-NEXT:    s_endpgm
4807;
4808; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4809; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4810; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4811; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4812; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4813; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4814; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4815; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4816; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4817; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4818; SKIP-CACHE-INV-NEXT:    s_endpgm
4819;
4820; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4821; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4822; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4823; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4824; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4825; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4826; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4827; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4828; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4829;
4830; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
4831; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4832; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4833; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4834; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4835; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4836; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4837; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4838; GFX90A-TGSPLIT-NEXT:    s_endpgm
4839;
4840;
4841    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4842entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4843  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4844  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
4845  ret void
4846}
4847
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering acq_rel, failure ordering acquire.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4848define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
4849; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4850; GFX6:       ; %bb.0: ; %entry
4851; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4852; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4853; GFX6-NEXT:    s_mov_b32 m0, -1
4854; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4855; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4856; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4857; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4858; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4859; GFX6-NEXT:    s_endpgm
4860;
4861; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4862; GFX7:       ; %bb.0: ; %entry
4863; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4864; GFX7-NEXT:    s_mov_b32 m0, -1
4865; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4866; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4867; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4868; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4869; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4870; GFX7-NEXT:    s_endpgm
4871;
4872; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4873; GFX10-WGP:       ; %bb.0: ; %entry
4874; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4875; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4876; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4877; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4878; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4879; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4880; GFX10-WGP-NEXT:    s_endpgm
4881;
4882; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4883; GFX10-CU:       ; %bb.0: ; %entry
4884; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4885; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4886; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4887; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4888; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4889; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4890; GFX10-CU-NEXT:    s_endpgm
4891;
4892; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4893; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4894; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4895; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4896; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4897; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4900; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4901; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4902; SKIP-CACHE-INV-NEXT:    s_endpgm
4903;
4904; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4905; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4906; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4907; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4908; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4909; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4910; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4911; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4912; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4913;
4914; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
4915; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4916; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4917; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4918; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4919; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4920; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4921; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4922; GFX90A-TGSPLIT-NEXT:    s_endpgm
4923;
4924;
4925    i32 addrspace(3)* %out, i32 %in, i32 %old) {
4926entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
4927  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
4928  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
4929  ret void
4930}
4931
; Test: volatile i32 cmpxchg through an addrspace(3) pointer at syncscope
; "singlethread-one-as", success ordering seq_cst, failure ordering acquire.
; The cmpxchg result (%val) is left unused. Every check block below expects
; the same shape: the operands are copied into VGPRs with v_mov_b32, a single
; ds_cmpst_b32 with offset:16 is issued, and s_endpgm follows it directly.
4932define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
4933; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4934; GFX6:       ; %bb.0: ; %entry
4935; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
4936; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
4937; GFX6-NEXT:    s_mov_b32 m0, -1
4938; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4939; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4940; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4941; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4942; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4943; GFX6-NEXT:    s_endpgm
4944;
4945; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4946; GFX7:       ; %bb.0: ; %entry
4947; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4948; GFX7-NEXT:    s_mov_b32 m0, -1
4949; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4950; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4951; GFX7-NEXT:    v_mov_b32_e32 v1, s2
4952; GFX7-NEXT:    v_mov_b32_e32 v2, s1
4953; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4954; GFX7-NEXT:    s_endpgm
4955;
4956; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4957; GFX10-WGP:       ; %bb.0: ; %entry
4958; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4959; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4960; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4961; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
4962; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
4963; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4964; GFX10-WGP-NEXT:    s_endpgm
4965;
4966; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4967; GFX10-CU:       ; %bb.0: ; %entry
4968; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4969; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4970; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4971; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
4972; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
4973; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4974; GFX10-CU-NEXT:    s_endpgm
4975;
4976; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4977; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4978; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4979; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4980; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
4981; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
4984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4985; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4986; SKIP-CACHE-INV-NEXT:    s_endpgm
4987;
4988; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4989; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4990; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4991; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4992; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4993; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4994; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
4995; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
4996; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4997;
4998; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
4999; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5000; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5001; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5002; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5003; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5004; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5005; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5006; GFX90A-TGSPLIT-NEXT:    s_endpgm
5007;
5008;
5009    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5010entry:
  ; Index 4 of i32 is a 16-byte displacement; it shows up as offset:16 in the checks.
5011  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and store %in on a match; the two orderings are success then failure.
5012  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
5013  ret void
5014}
5015
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with seq_cst success and seq_cst failure
; ordering. The cmpxchg result is not used, and the expected code for every
; checked target is the non-returning ds_cmpst_b32 followed directly by
; s_endpgm.
5016define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
5017; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5018; GFX6:       ; %bb.0: ; %entry
5019; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5020; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5021; GFX6-NEXT:    s_mov_b32 m0, -1
5022; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5023; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5024; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5025; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5026; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5027; GFX6-NEXT:    s_endpgm
5028;
5029; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5030; GFX7:       ; %bb.0: ; %entry
5031; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5032; GFX7-NEXT:    s_mov_b32 m0, -1
5033; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5034; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5035; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5036; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5037; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5038; GFX7-NEXT:    s_endpgm
5039;
5040; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5041; GFX10-WGP:       ; %bb.0: ; %entry
5042; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5043; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5044; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5045; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5046; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5047; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5048; GFX10-WGP-NEXT:    s_endpgm
5049;
5050; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5051; GFX10-CU:       ; %bb.0: ; %entry
5052; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5053; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5055; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5056; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5057; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5058; GFX10-CU-NEXT:    s_endpgm
5059;
5060; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5061; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5062; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5063; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5064; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5065; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5066; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5069; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5070; SKIP-CACHE-INV-NEXT:    s_endpgm
5071;
5072; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5073; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5074; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5075; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5076; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5077; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5078; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5079; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5080; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5081;
5082; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
5083; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5084; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5085; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5086; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5087; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5088; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5089; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
5090; GFX90A-TGSPLIT-NEXT:    s_endpgm
5091;
5092;
5093    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5094entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_b32 lines above.
5095  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5096  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
5097  ret void
5098}
5099
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with acquire success and monotonic failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5100define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
5101; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5102; GFX6:       ; %bb.0: ; %entry
5103; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5104; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5105; GFX6-NEXT:    s_mov_b32 m0, -1
5106; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5107; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5108; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5109; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5110; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5111; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5112; GFX6-NEXT:    ds_write_b32 v0, v1
5113; GFX6-NEXT:    s_endpgm
5114;
5115; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5116; GFX7:       ; %bb.0: ; %entry
5117; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5118; GFX7-NEXT:    s_mov_b32 m0, -1
5119; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5120; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5121; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5122; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5123; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5124; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5125; GFX7-NEXT:    ds_write_b32 v0, v1
5126; GFX7-NEXT:    s_endpgm
5127;
5128; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5129; GFX10-WGP:       ; %bb.0: ; %entry
5130; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5131; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5132; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5133; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5134; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5135; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5136; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5137; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5138; GFX10-WGP-NEXT:    s_endpgm
5139;
5140; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5141; GFX10-CU:       ; %bb.0: ; %entry
5142; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5143; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5144; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5145; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5146; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5147; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5148; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5149; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5150; GFX10-CU-NEXT:    s_endpgm
5151;
5152; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5153; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5154; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5155; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5156; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5157; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5158; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5159; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5160; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5161; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5162; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5163; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5164; SKIP-CACHE-INV-NEXT:    s_endpgm
5165;
5166; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5167; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5168; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5169; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5170; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5171; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5172; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5173; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5174; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5175; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5176; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5177;
5178; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
5179; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5180; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5181; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5182; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5183; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5184; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5185; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5186; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5187; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5188; GFX90A-TGSPLIT-NEXT:    s_endpgm
5189;
5190;
5191    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5192entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5193  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5194  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5195  %val0 = extractvalue { i32, i1 } %val, 0
5196  store i32 %val0, i32 addrspace(3)* %out, align 4
5197  ret void
5198}
5199
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with acq_rel success and monotonic failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5200define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
5201; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5202; GFX6:       ; %bb.0: ; %entry
5203; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5204; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5205; GFX6-NEXT:    s_mov_b32 m0, -1
5206; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5207; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5208; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5209; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5210; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5211; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5212; GFX6-NEXT:    ds_write_b32 v0, v1
5213; GFX6-NEXT:    s_endpgm
5214;
5215; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5216; GFX7:       ; %bb.0: ; %entry
5217; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5218; GFX7-NEXT:    s_mov_b32 m0, -1
5219; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5220; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5221; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5222; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5223; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5224; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5225; GFX7-NEXT:    ds_write_b32 v0, v1
5226; GFX7-NEXT:    s_endpgm
5227;
5228; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5229; GFX10-WGP:       ; %bb.0: ; %entry
5230; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5231; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5232; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5233; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5234; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5235; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5236; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5237; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5238; GFX10-WGP-NEXT:    s_endpgm
5239;
5240; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5241; GFX10-CU:       ; %bb.0: ; %entry
5242; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5243; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5244; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5245; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5246; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5247; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5248; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5249; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5250; GFX10-CU-NEXT:    s_endpgm
5251;
5252; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5253; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5254; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5255; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5256; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5257; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5258; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5259; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5260; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5261; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5262; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5263; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5264; SKIP-CACHE-INV-NEXT:    s_endpgm
5265;
5266; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5267; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5268; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5269; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5270; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5271; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5272; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5273; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5274; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5275; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5276; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5277;
5278; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
5279; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5280; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5281; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5282; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5283; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5284; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5285; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5286; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5287; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5288; GFX90A-TGSPLIT-NEXT:    s_endpgm
5289;
5290;
5291    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5292entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5293  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5294  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5295  %val0 = extractvalue { i32, i1 } %val, 0
5296  store i32 %val0, i32 addrspace(3)* %out, align 4
5297  ret void
5298}
5299
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with seq_cst success and monotonic failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5300define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
5301; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5302; GFX6:       ; %bb.0: ; %entry
5303; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5304; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5305; GFX6-NEXT:    s_mov_b32 m0, -1
5306; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5307; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5308; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5309; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5310; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5311; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5312; GFX6-NEXT:    ds_write_b32 v0, v1
5313; GFX6-NEXT:    s_endpgm
5314;
5315; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5316; GFX7:       ; %bb.0: ; %entry
5317; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5318; GFX7-NEXT:    s_mov_b32 m0, -1
5319; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5320; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5321; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5322; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5323; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5324; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5325; GFX7-NEXT:    ds_write_b32 v0, v1
5326; GFX7-NEXT:    s_endpgm
5327;
5328; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5329; GFX10-WGP:       ; %bb.0: ; %entry
5330; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5331; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5332; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5333; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5334; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5335; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5336; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5337; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5338; GFX10-WGP-NEXT:    s_endpgm
5339;
5340; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5341; GFX10-CU:       ; %bb.0: ; %entry
5342; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5343; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5344; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5345; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5346; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5347; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5348; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5349; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5350; GFX10-CU-NEXT:    s_endpgm
5351;
5352; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5353; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5354; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5355; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5356; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5357; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5361; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5362; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5363; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5364; SKIP-CACHE-INV-NEXT:    s_endpgm
5365;
5366; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5367; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5368; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5369; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5370; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5371; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5372; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5373; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5374; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5375; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5376; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5377;
5378; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
5379; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5380; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5381; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5382; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5383; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5384; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5385; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5386; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5387; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5388; GFX90A-TGSPLIT-NEXT:    s_endpgm
5389;
5390;
5391    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5392entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5393  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5394  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5395  %val0 = extractvalue { i32, i1 } %val, 0
5396  store i32 %val0, i32 addrspace(3)* %out, align 4
5397  ret void
5398}
5399
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with acquire success and acquire failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5400define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg(
5401; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5402; GFX6:       ; %bb.0: ; %entry
5403; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5404; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5405; GFX6-NEXT:    s_mov_b32 m0, -1
5406; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5407; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5408; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5409; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5410; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5411; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5412; GFX6-NEXT:    ds_write_b32 v0, v1
5413; GFX6-NEXT:    s_endpgm
5414;
5415; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5416; GFX7:       ; %bb.0: ; %entry
5417; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5418; GFX7-NEXT:    s_mov_b32 m0, -1
5419; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5420; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5421; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5422; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5423; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5424; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5425; GFX7-NEXT:    ds_write_b32 v0, v1
5426; GFX7-NEXT:    s_endpgm
5427;
5428; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5429; GFX10-WGP:       ; %bb.0: ; %entry
5430; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5431; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5432; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5433; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5434; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5435; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5436; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5437; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5438; GFX10-WGP-NEXT:    s_endpgm
5439;
5440; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5441; GFX10-CU:       ; %bb.0: ; %entry
5442; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5443; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5444; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5445; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5446; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5447; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5448; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5449; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5450; GFX10-CU-NEXT:    s_endpgm
5451;
5452; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5453; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5454; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5455; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5456; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5457; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5458; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5459; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5460; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5461; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5462; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5463; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5464; SKIP-CACHE-INV-NEXT:    s_endpgm
5465;
5466; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5467; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5468; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5469; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5470; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5471; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5472; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5473; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5474; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5475; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5476; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5477;
5478; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
5479; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5480; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5481; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5482; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5483; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5484; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5485; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5486; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5487; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5488; GFX90A-TGSPLIT-NEXT:    s_endpgm
5489;
5490;
5491    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5492entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5493  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5494  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5495  %val0 = extractvalue { i32, i1 } %val, 0
5496  store i32 %val0, i32 addrspace(3)* %out, align 4
5497  ret void
5498}
5499
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with release success and acquire failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5500define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg(
5501; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5502; GFX6:       ; %bb.0: ; %entry
5503; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5504; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5505; GFX6-NEXT:    s_mov_b32 m0, -1
5506; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5507; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5508; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5509; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5510; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5511; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5512; GFX6-NEXT:    ds_write_b32 v0, v1
5513; GFX6-NEXT:    s_endpgm
5514;
5515; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5516; GFX7:       ; %bb.0: ; %entry
5517; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5518; GFX7-NEXT:    s_mov_b32 m0, -1
5519; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5520; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5521; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5522; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5523; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5524; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5525; GFX7-NEXT:    ds_write_b32 v0, v1
5526; GFX7-NEXT:    s_endpgm
5527;
5528; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5529; GFX10-WGP:       ; %bb.0: ; %entry
5530; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5531; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5532; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5533; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5534; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5535; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5536; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5537; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5538; GFX10-WGP-NEXT:    s_endpgm
5539;
5540; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5541; GFX10-CU:       ; %bb.0: ; %entry
5542; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5543; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5544; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5545; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5546; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5547; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5548; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5549; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5550; GFX10-CU-NEXT:    s_endpgm
5551;
5552; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5553; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5554; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5555; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5556; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5557; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5558; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5559; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5560; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5561; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5562; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5563; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5564; SKIP-CACHE-INV-NEXT:    s_endpgm
5565;
5566; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5567; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5568; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5569; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5570; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5571; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5572; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5573; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5574; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5575; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5576; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5577;
5578; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
5579; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5580; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5581; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5582; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5583; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5585; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5586; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5587; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5588; GFX90A-TGSPLIT-NEXT:    s_endpgm
5589;
5590;
5591    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5592entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5593  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5594  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5595  %val0 = extractvalue { i32, i1 } %val, 0
5596  store i32 %val0, i32 addrspace(3)* %out, align 4
5597  ret void
5598}
5599
; Kernel under test: a volatile cmpxchg on an addrspace(3) pointer at
; syncscope("singlethread-one-as") with acq_rel success and acquire failure
; ordering. The loaded value is stored back to %out, so the expected code for
; every checked target is the returning ds_cmpst_rtn_b32, an
; s_waitcnt lgkmcnt(0), and then a ds_write_b32 of the result.
5600define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
5601; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5602; GFX6:       ; %bb.0: ; %entry
5603; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
5604; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
5605; GFX6-NEXT:    s_mov_b32 m0, -1
5606; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5607; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5608; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5609; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5610; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5611; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5612; GFX6-NEXT:    ds_write_b32 v0, v1
5613; GFX6-NEXT:    s_endpgm
5614;
5615; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5616; GFX7:       ; %bb.0: ; %entry
5617; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5618; GFX7-NEXT:    s_mov_b32 m0, -1
5619; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5620; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5621; GFX7-NEXT:    v_mov_b32_e32 v1, s2
5622; GFX7-NEXT:    v_mov_b32_e32 v2, s1
5623; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5624; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5625; GFX7-NEXT:    ds_write_b32 v0, v1
5626; GFX7-NEXT:    s_endpgm
5627;
5628; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5629; GFX10-WGP:       ; %bb.0: ; %entry
5630; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5631; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5632; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5633; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
5634; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
5635; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5636; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5637; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
5638; GFX10-WGP-NEXT:    s_endpgm
5639;
5640; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5641; GFX10-CU:       ; %bb.0: ; %entry
5642; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5643; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5644; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5645; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
5646; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
5647; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5648; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5649; GFX10-CU-NEXT:    ds_write_b32 v0, v1
5650; GFX10-CU-NEXT:    s_endpgm
5651;
5652; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5653; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5654; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5655; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5656; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
5657; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5658; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5659; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
5660; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5661; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5662; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5663; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
5664; SKIP-CACHE-INV-NEXT:    s_endpgm
5665;
5666; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5667; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5668; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5669; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5670; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5671; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5672; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5673; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5674; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5675; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
5676; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5677;
5678; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
5679; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5680; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5681; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5682; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5683; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5684; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
5685; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
5686; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5687; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
5688; GFX90A-TGSPLIT-NEXT:    s_endpgm
5689;
5690;
5691    i32 addrspace(3)* %out, i32 %in, i32 %old) {
5692entry:
  ; Element 4 of an i32 array is 16 bytes past %out, matching the offset:16
  ; operand in the expected ds_cmpst_rtn_b32 lines above.
5693  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; %old is the comparand and %in is the replacement value.
5694  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out itself (not %gep), so the returned value is used.
5695  %val0 = extractvalue { i32, i1 } %val, 0
5696  store i32 %val0, i32 addrspace(3)* %out, align 4
5697  ret void
5698}
5699
; Returning cmpxchg on an addrspace(3) pointer with syncscope
; "singlethread-one-as", success ordering seq_cst and failure ordering acquire.
; The value read by the cmpxchg (field 0 of its result) is stored through %out.
; For each run configuration checked below, the expected code is one
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then the
; ds_write_b32 of the returned value, with nothing else in between.
define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out; this is the offset:16
  ; operand on the ds_cmpst_rtn_b32 in the checks above.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and swap in %in (seq_cst on success, acquire on failure).
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result forces the returning (_rtn) form of the instruction.
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
5799
; Returning cmpxchg on an addrspace(3) pointer with syncscope
; "singlethread-one-as", with seq_cst as both the success and failure ordering.
; The value read by the cmpxchg (field 0 of its result) is stored through %out.
; For each run configuration checked below, the expected code is one
; ds_cmpst_rtn_b32 at offset:16, then s_waitcnt lgkmcnt(0), then the
; ds_write_b32 of the returned value, with nothing else in between.
define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out; this is the offset:16
  ; operand on the ds_cmpst_rtn_b32 in the checks above.
  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
  ; Compare against %old and swap in %in (seq_cst on both success and failure).
  %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the result forces the returning (_rtn) form of the instruction.
  store i32 %val0, i32 addrspace(3)* %out, align 4
  ret void
}
5899
5900