1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8
; Test kernel: atomically loads an i32 from the flat pointer %in using
; syncscope("singlethread") with unordered ordering, then writes the loaded
; value to %out with an ordinary (non-atomic) store.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. In every
; group the only instructions between flat_load_dword and flat_store_dword are
; the address moves and a single s_waitcnt. The +tgsplit group waits on
; vmcnt(0) only; all other groups wait on vmcnt(0) lgkmcnt(0).
9define amdgpu_kernel void @flat_singlethread_unordered_load(
10; GFX7-LABEL: flat_singlethread_unordered_load:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
13; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX7-NEXT:    v_mov_b32_e32 v0, s0
15; GFX7-NEXT:    v_mov_b32_e32 v1, s1
16; GFX7-NEXT:    flat_load_dword v0, v[0:1]
17; GFX7-NEXT:    v_mov_b32_e32 v2, s2
18; GFX7-NEXT:    v_mov_b32_e32 v3, s3
19; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20; GFX7-NEXT:    flat_store_dword v[2:3], v0
21; GFX7-NEXT:    s_endpgm
22;
23; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
24; GFX10-WGP:       ; %bb.0: ; %entry
25; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
26; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
28; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
29; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
30; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
31; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
32; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
33; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
34; GFX10-WGP-NEXT:    s_endpgm
35;
36; GFX10-CU-LABEL: flat_singlethread_unordered_load:
37; GFX10-CU:       ; %bb.0: ; %entry
38; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
39; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
43; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
44; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
45; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
46; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
47; GFX10-CU-NEXT:    s_endpgm
48;
49; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
50; SKIP-CACHE-INV:       ; %bb.0: ; %entry
51; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
52; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
53; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
54; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
55; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
56; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
57; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
58; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
59; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
60; SKIP-CACHE-INV-NEXT:    s_endpgm
61;
62; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
63; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
64; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
65; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
67; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
68; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
69; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
70; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
71; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
72; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
73; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
74;
75; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
76; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
77; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
78; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
80; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
81; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
82; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
83; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
84; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
85; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
86; GFX90A-TGSPLIT-NEXT:    s_endpgm
87    i32* %in, i32* %out) {
88entry:
  ; Operation under test: singlethread-scope, unordered atomic load.
89  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
  ; Plain store that consumes the loaded value.
90  store i32 %val, i32* %out
91  ret void
92}
93
; Test kernel: atomically loads an i32 from the flat pointer %in using
; syncscope("singlethread") with monotonic ordering, then writes the loaded
; value to %out with an ordinary (non-atomic) store.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. In every
; group the only instructions between flat_load_dword and flat_store_dword are
; the address moves and a single s_waitcnt. The +tgsplit group waits on
; vmcnt(0) only; all other groups wait on vmcnt(0) lgkmcnt(0).
94define amdgpu_kernel void @flat_singlethread_monotonic_load(
95; GFX7-LABEL: flat_singlethread_monotonic_load:
96; GFX7:       ; %bb.0: ; %entry
97; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
98; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX7-NEXT:    v_mov_b32_e32 v0, s0
100; GFX7-NEXT:    v_mov_b32_e32 v1, s1
101; GFX7-NEXT:    flat_load_dword v0, v[0:1]
102; GFX7-NEXT:    v_mov_b32_e32 v2, s2
103; GFX7-NEXT:    v_mov_b32_e32 v3, s3
104; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
105; GFX7-NEXT:    flat_store_dword v[2:3], v0
106; GFX7-NEXT:    s_endpgm
107;
108; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
109; GFX10-WGP:       ; %bb.0: ; %entry
110; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
111; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
113; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
114; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
115; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
116; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
117; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
118; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
119; GFX10-WGP-NEXT:    s_endpgm
120;
121; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
122; GFX10-CU:       ; %bb.0: ; %entry
123; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
124; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
126; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
127; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
128; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
129; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
130; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
132; GFX10-CU-NEXT:    s_endpgm
133;
134; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
135; SKIP-CACHE-INV:       ; %bb.0: ; %entry
136; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
137; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
138; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
139; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
140; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
143; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
144; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
145; SKIP-CACHE-INV-NEXT:    s_endpgm
146;
147; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
148; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
149; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
150; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
152; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
153; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
154; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
155; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
156; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
157; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
158; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
159;
160; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
161; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
162; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
163; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
165; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
166; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
167; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
168; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
169; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
170; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
171; GFX90A-TGSPLIT-NEXT:    s_endpgm
172    i32* %in, i32* %out) {
173entry:
  ; Operation under test: singlethread-scope, monotonic atomic load.
174  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
  ; Plain store that consumes the loaded value.
175  store i32 %val, i32* %out
176  ret void
177}
178
; Test kernel: atomically loads an i32 from the flat pointer %in using
; syncscope("singlethread") with acquire ordering, then writes the loaded
; value to %out with an ordinary (non-atomic) store.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. In every
; group the only instructions between flat_load_dword and flat_store_dword are
; the address moves and a single s_waitcnt. The +tgsplit group waits on
; vmcnt(0) only; all other groups wait on vmcnt(0) lgkmcnt(0).
179define amdgpu_kernel void @flat_singlethread_acquire_load(
180; GFX7-LABEL: flat_singlethread_acquire_load:
181; GFX7:       ; %bb.0: ; %entry
182; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
183; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX7-NEXT:    v_mov_b32_e32 v0, s0
185; GFX7-NEXT:    v_mov_b32_e32 v1, s1
186; GFX7-NEXT:    flat_load_dword v0, v[0:1]
187; GFX7-NEXT:    v_mov_b32_e32 v2, s2
188; GFX7-NEXT:    v_mov_b32_e32 v3, s3
189; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
190; GFX7-NEXT:    flat_store_dword v[2:3], v0
191; GFX7-NEXT:    s_endpgm
192;
193; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
194; GFX10-WGP:       ; %bb.0: ; %entry
195; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
196; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
197; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
199; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
200; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
201; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
202; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
203; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
204; GFX10-WGP-NEXT:    s_endpgm
205;
206; GFX10-CU-LABEL: flat_singlethread_acquire_load:
207; GFX10-CU:       ; %bb.0: ; %entry
208; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
209; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
211; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
212; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
213; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
214; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
215; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
216; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
217; GFX10-CU-NEXT:    s_endpgm
218;
219; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
220; SKIP-CACHE-INV:       ; %bb.0: ; %entry
221; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
222; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
225; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
226; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
227; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
228; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
229; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
230; SKIP-CACHE-INV-NEXT:    s_endpgm
231;
232; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
233; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
234; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
235; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
237; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
238; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
239; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
240; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
241; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
242; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
243; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
244;
245; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
246; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
247; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
248; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
250; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
251; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
252; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
253; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
254; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
255; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
256; GFX90A-TGSPLIT-NEXT:    s_endpgm
257    i32* %in, i32* %out) {
258entry:
  ; Operation under test: singlethread-scope, acquire atomic load.
259  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
  ; Plain store that consumes the loaded value.
260  store i32 %val, i32* %out
261  ret void
262}
263
; Test kernel: atomically loads an i32 from the flat pointer %in using
; syncscope("singlethread") with seq_cst ordering, then writes the loaded
; value to %out with an ordinary (non-atomic) store.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. In every
; group the only instructions between flat_load_dword and flat_store_dword are
; the address moves and a single s_waitcnt. The +tgsplit group waits on
; vmcnt(0) only; all other groups wait on vmcnt(0) lgkmcnt(0).
264define amdgpu_kernel void @flat_singlethread_seq_cst_load(
265; GFX7-LABEL: flat_singlethread_seq_cst_load:
266; GFX7:       ; %bb.0: ; %entry
267; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
268; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX7-NEXT:    v_mov_b32_e32 v0, s0
270; GFX7-NEXT:    v_mov_b32_e32 v1, s1
271; GFX7-NEXT:    flat_load_dword v0, v[0:1]
272; GFX7-NEXT:    v_mov_b32_e32 v2, s2
273; GFX7-NEXT:    v_mov_b32_e32 v3, s3
274; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
275; GFX7-NEXT:    flat_store_dword v[2:3], v0
276; GFX7-NEXT:    s_endpgm
277;
278; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
279; GFX10-WGP:       ; %bb.0: ; %entry
280; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
281; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
283; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
284; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
285; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
286; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
287; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
288; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
289; GFX10-WGP-NEXT:    s_endpgm
290;
291; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
292; GFX10-CU:       ; %bb.0: ; %entry
293; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
294; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
296; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
297; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
298; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
299; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
300; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
301; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
302; GFX10-CU-NEXT:    s_endpgm
303;
304; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
305; SKIP-CACHE-INV:       ; %bb.0: ; %entry
306; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
307; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
308; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
310; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
311; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
313; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
314; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
315; SKIP-CACHE-INV-NEXT:    s_endpgm
316;
317; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
318; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
319; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
320; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
322; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
323; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
324; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
325; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
326; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
327; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
328; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
329;
330; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
331; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
332; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
333; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
335; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
336; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
337; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
338; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
339; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
340; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
341; GFX90A-TGSPLIT-NEXT:    s_endpgm
342    i32* %in, i32* %out) {
343entry:
  ; Operation under test: singlethread-scope, seq_cst atomic load.
344  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
  ; Plain store that consumes the loaded value.
345  store i32 %val, i32* %out
346  ret void
347}
348
; Test kernel: atomically stores the i32 argument %in to the flat pointer
; %out using syncscope("singlethread") with unordered ordering.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. Every
; group expects the kernel arguments to be loaded and moved into VGPRs,
; followed directly by one flat_store_dword and s_endpgm.
349define amdgpu_kernel void @flat_singlethread_unordered_store(
350; GFX7-LABEL: flat_singlethread_unordered_store:
351; GFX7:       ; %bb.0: ; %entry
352; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
353; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    v_mov_b32_e32 v2, s2
356; GFX7-NEXT:    v_mov_b32_e32 v0, s0
357; GFX7-NEXT:    v_mov_b32_e32 v1, s1
358; GFX7-NEXT:    flat_store_dword v[0:1], v2
359; GFX7-NEXT:    s_endpgm
360;
361; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
362; GFX10-WGP:       ; %bb.0: ; %entry
363; GFX10-WGP-NEXT:    s_clause 0x1
364; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
365; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
366; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
368; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
369; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
370; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
371; GFX10-WGP-NEXT:    s_endpgm
372;
373; GFX10-CU-LABEL: flat_singlethread_unordered_store:
374; GFX10-CU:       ; %bb.0: ; %entry
375; GFX10-CU-NEXT:    s_clause 0x1
376; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
377; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
378; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
380; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
381; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
382; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
383; GFX10-CU-NEXT:    s_endpgm
384;
385; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
386; SKIP-CACHE-INV:       ; %bb.0: ; %entry
387; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
388; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
389; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
390; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
393; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
394; SKIP-CACHE-INV-NEXT:    s_endpgm
395;
396; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
397; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
398; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
399; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
400; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
402; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
403; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
404; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
405;
406; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
407; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
408; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
409; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
410; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
412; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
413; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
414; GFX90A-TGSPLIT-NEXT:    s_endpgm
415    i32 %in, i32* %out) {
416entry:
  ; Operation under test: singlethread-scope, unordered atomic store.
417  store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
418  ret void
419}
420
; Test kernel: atomically stores the i32 argument %in to the flat pointer
; %out using syncscope("singlethread") with monotonic ordering.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. Every
; group expects the kernel arguments to be loaded and moved into VGPRs,
; followed directly by one flat_store_dword and s_endpgm.
421define amdgpu_kernel void @flat_singlethread_monotonic_store(
422; GFX7-LABEL: flat_singlethread_monotonic_store:
423; GFX7:       ; %bb.0: ; %entry
424; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
425; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
426; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX7-NEXT:    v_mov_b32_e32 v2, s2
428; GFX7-NEXT:    v_mov_b32_e32 v0, s0
429; GFX7-NEXT:    v_mov_b32_e32 v1, s1
430; GFX7-NEXT:    flat_store_dword v[0:1], v2
431; GFX7-NEXT:    s_endpgm
432;
433; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
434; GFX10-WGP:       ; %bb.0: ; %entry
435; GFX10-WGP-NEXT:    s_clause 0x1
436; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
437; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
438; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
440; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
441; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
442; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
443; GFX10-WGP-NEXT:    s_endpgm
444;
445; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
446; GFX10-CU:       ; %bb.0: ; %entry
447; GFX10-CU-NEXT:    s_clause 0x1
448; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
449; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
450; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
452; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
453; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
454; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
455; GFX10-CU-NEXT:    s_endpgm
456;
457; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
458; SKIP-CACHE-INV:       ; %bb.0: ; %entry
459; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
460; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
461; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
462; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
463; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
464; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
465; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
466; SKIP-CACHE-INV-NEXT:    s_endpgm
467;
468; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
469; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
470; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
471; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
472; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
474; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
475; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
476; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
477;
478; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
479; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
480; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
481; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
482; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
484; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
485; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
486; GFX90A-TGSPLIT-NEXT:    s_endpgm
487    i32 %in, i32* %out) {
488entry:
  ; Operation under test: singlethread-scope, monotonic atomic store.
489  store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
490  ret void
491}
492
; Test kernel: atomically stores the i32 argument %in to the flat pointer
; %out using syncscope("singlethread") with release ordering.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. Every
; group expects the kernel arguments to be loaded and moved into VGPRs,
; followed directly by one flat_store_dword and s_endpgm.
493define amdgpu_kernel void @flat_singlethread_release_store(
494; GFX7-LABEL: flat_singlethread_release_store:
495; GFX7:       ; %bb.0: ; %entry
496; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
497; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
498; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX7-NEXT:    v_mov_b32_e32 v2, s2
500; GFX7-NEXT:    v_mov_b32_e32 v0, s0
501; GFX7-NEXT:    v_mov_b32_e32 v1, s1
502; GFX7-NEXT:    flat_store_dword v[0:1], v2
503; GFX7-NEXT:    s_endpgm
504;
505; GFX10-WGP-LABEL: flat_singlethread_release_store:
506; GFX10-WGP:       ; %bb.0: ; %entry
507; GFX10-WGP-NEXT:    s_clause 0x1
508; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
509; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
510; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
512; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
513; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
514; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
515; GFX10-WGP-NEXT:    s_endpgm
516;
517; GFX10-CU-LABEL: flat_singlethread_release_store:
518; GFX10-CU:       ; %bb.0: ; %entry
519; GFX10-CU-NEXT:    s_clause 0x1
520; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
521; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
522; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
525; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
526; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
527; GFX10-CU-NEXT:    s_endpgm
528;
529; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
530; SKIP-CACHE-INV:       ; %bb.0: ; %entry
531; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
532; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
533; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
537; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
538; SKIP-CACHE-INV-NEXT:    s_endpgm
539;
540; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
541; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
542; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
543; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
544; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
545; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
546; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
547; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
548; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
549;
550; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
551; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
552; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
553; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
554; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
556; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
557; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
558; GFX90A-TGSPLIT-NEXT:    s_endpgm
559    i32 %in, i32* %out) {
560entry:
  ; Operation under test: singlethread-scope, release atomic store.
561  store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
562  ret void
563}
564
; Test kernel: atomically stores the i32 argument %in to the flat pointer
; %out using syncscope("singlethread") with seq_cst ordering.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. Every
; group expects the kernel arguments to be loaded and moved into VGPRs,
; followed directly by one flat_store_dword and s_endpgm.
565define amdgpu_kernel void @flat_singlethread_seq_cst_store(
566; GFX7-LABEL: flat_singlethread_seq_cst_store:
567; GFX7:       ; %bb.0: ; %entry
568; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
569; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
570; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX7-NEXT:    v_mov_b32_e32 v2, s2
572; GFX7-NEXT:    v_mov_b32_e32 v0, s0
573; GFX7-NEXT:    v_mov_b32_e32 v1, s1
574; GFX7-NEXT:    flat_store_dword v[0:1], v2
575; GFX7-NEXT:    s_endpgm
576;
577; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
578; GFX10-WGP:       ; %bb.0: ; %entry
579; GFX10-WGP-NEXT:    s_clause 0x1
580; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
581; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
582; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
584; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
585; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
586; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
587; GFX10-WGP-NEXT:    s_endpgm
588;
589; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
590; GFX10-CU:       ; %bb.0: ; %entry
591; GFX10-CU-NEXT:    s_clause 0x1
592; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
593; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
594; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
596; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
597; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
598; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
599; GFX10-CU-NEXT:    s_endpgm
600;
601; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
602; SKIP-CACHE-INV:       ; %bb.0: ; %entry
603; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
604; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
605; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
606; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
607; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
608; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
609; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
610; SKIP-CACHE-INV-NEXT:    s_endpgm
611;
612; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
613; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
614; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
615; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
616; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
618; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
619; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
620; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
621;
622; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
623; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
624; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
625; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
626; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
628; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
629; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
630; GFX90A-TGSPLIT-NEXT:    s_endpgm
631    i32 %in, i32* %out) {
632entry:
  ; Operation under test: singlethread-scope, seq_cst atomic store.
633  store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
634  ret void
635}
636
; Test kernel: performs a volatile atomic exchange (atomicrmw xchg) of the i32
; argument %in into the flat pointer %out using syncscope("singlethread") with
; monotonic ordering. The previous value returned in %val is never used.
;
; The check lines embedded in the signature below are the autogenerated
; expected llc output, one group per run line at the top of the file. Every
; group expects the kernel arguments to be loaded and moved into VGPRs,
; followed directly by one flat_atomic_swap and s_endpgm.
637define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
638; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
639; GFX7:       ; %bb.0: ; %entry
640; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
641; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
642; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX7-NEXT:    v_mov_b32_e32 v0, s0
644; GFX7-NEXT:    v_mov_b32_e32 v1, s1
645; GFX7-NEXT:    v_mov_b32_e32 v2, s2
646; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
647; GFX7-NEXT:    s_endpgm
648;
649; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
650; GFX10-WGP:       ; %bb.0: ; %entry
651; GFX10-WGP-NEXT:    s_clause 0x1
652; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
653; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
654; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
656; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
657; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
658; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
659; GFX10-WGP-NEXT:    s_endpgm
660;
661; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
662; GFX10-CU:       ; %bb.0: ; %entry
663; GFX10-CU-NEXT:    s_clause 0x1
664; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
665; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
666; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
668; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
669; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
670; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
671; GFX10-CU-NEXT:    s_endpgm
672;
673; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
674; SKIP-CACHE-INV:       ; %bb.0: ; %entry
675; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
676; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
677; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
679; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
681; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
682; SKIP-CACHE-INV-NEXT:    s_endpgm
683;
684; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
685; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
686; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
687; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
688; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
690; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
691; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
692; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
693;
694; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
695; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
696; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
697; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
698; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
700; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
701; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
702; GFX90A-TGSPLIT-NEXT:    s_endpgm
703    i32* %out, i32 %in) {
704entry:
  ; Operation under test: singlethread-scope, monotonic atomic exchange.
  ; %val (the old memory value) is intentionally left unused.
705  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
706  ret void
707}
708
; Non-returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and acquire ordering. For every RUN configuration
; the expected code is: load the kernel arguments, copy them to VGPRs, then a
; single flat_atomic_swap immediately followed by s_endpgm. No extra wait or
; cache-maintenance instruction is expected around the atomic.
709define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
710; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
711; GFX7:       ; %bb.0: ; %entry
712; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
713; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
714; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX7-NEXT:    v_mov_b32_e32 v0, s0
716; GFX7-NEXT:    v_mov_b32_e32 v1, s1
717; GFX7-NEXT:    v_mov_b32_e32 v2, s2
718; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
719; GFX7-NEXT:    s_endpgm
720;
721; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
722; GFX10-WGP:       ; %bb.0: ; %entry
723; GFX10-WGP-NEXT:    s_clause 0x1
724; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
725; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
726; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
728; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
729; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
730; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
731; GFX10-WGP-NEXT:    s_endpgm
732;
733; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
734; GFX10-CU:       ; %bb.0: ; %entry
735; GFX10-CU-NEXT:    s_clause 0x1
736; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
737; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
738; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
740; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
741; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
742; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
743; GFX10-CU-NEXT:    s_endpgm
744;
745; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
746; SKIP-CACHE-INV:       ; %bb.0: ; %entry
747; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
748; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
749; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
750; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
751; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
753; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
754; SKIP-CACHE-INV-NEXT:    s_endpgm
755;
756; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
757; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
758; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
759; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
760; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
761; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
762; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
763; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
764; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
765;
766; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
767; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
768; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
769; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
770; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
772; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
773; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
774; GFX90A-TGSPLIT-NEXT:    s_endpgm
775    i32* %out, i32 %in) {
776entry:
  ; %val has no uses in this function; the checks above expect the swap
  ; without a destination register and without glc.
777  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
778  ret void
779}
780
; Non-returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and release ordering. Each RUN configuration is
; expected to emit only the kernel-argument loads, the VGPR copies and one
; flat_atomic_swap directly before s_endpgm; no wait is expected ahead of the
; atomic beyond the lgkmcnt(0) that covers the scalar argument loads.
781define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
782; GFX7-LABEL: flat_singlethread_release_atomicrmw:
783; GFX7:       ; %bb.0: ; %entry
784; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
785; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
786; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX7-NEXT:    v_mov_b32_e32 v0, s0
788; GFX7-NEXT:    v_mov_b32_e32 v1, s1
789; GFX7-NEXT:    v_mov_b32_e32 v2, s2
790; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
791; GFX7-NEXT:    s_endpgm
792;
793; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
794; GFX10-WGP:       ; %bb.0: ; %entry
795; GFX10-WGP-NEXT:    s_clause 0x1
796; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
797; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
798; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
800; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
801; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
802; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
803; GFX10-WGP-NEXT:    s_endpgm
804;
805; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
806; GFX10-CU:       ; %bb.0: ; %entry
807; GFX10-CU-NEXT:    s_clause 0x1
808; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
809; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
810; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
812; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
813; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
814; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
815; GFX10-CU-NEXT:    s_endpgm
816;
817; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
818; SKIP-CACHE-INV:       ; %bb.0: ; %entry
819; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
820; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
821; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
822; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
823; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
824; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
825; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
826; SKIP-CACHE-INV-NEXT:    s_endpgm
827;
828; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
829; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
830; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
831; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
832; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
834; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
835; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
836; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
837;
838; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
839; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
840; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
841; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
842; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
844; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
845; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
846; GFX90A-TGSPLIT-NEXT:    s_endpgm
847    i32* %out, i32 %in) {
848entry:
  ; %val has no uses in this function; the checks above expect the swap
  ; without a destination register and without glc.
849  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
850  ret void
851}
852
; Non-returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and acq_rel ordering. The expected instruction
; sequence per RUN configuration is the kernel-argument loads, the VGPR
; copies and a single flat_atomic_swap followed directly by s_endpgm, i.e.
; nothing extra is expected before or after the atomic.
853define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
854; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
855; GFX7:       ; %bb.0: ; %entry
856; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
857; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
858; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX7-NEXT:    v_mov_b32_e32 v0, s0
860; GFX7-NEXT:    v_mov_b32_e32 v1, s1
861; GFX7-NEXT:    v_mov_b32_e32 v2, s2
862; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
863; GFX7-NEXT:    s_endpgm
864;
865; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
866; GFX10-WGP:       ; %bb.0: ; %entry
867; GFX10-WGP-NEXT:    s_clause 0x1
868; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
869; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
870; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
872; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
873; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
874; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
875; GFX10-WGP-NEXT:    s_endpgm
876;
877; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
878; GFX10-CU:       ; %bb.0: ; %entry
879; GFX10-CU-NEXT:    s_clause 0x1
880; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
881; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
882; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
884; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
885; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
886; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
887; GFX10-CU-NEXT:    s_endpgm
888;
889; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw:
890; SKIP-CACHE-INV:       ; %bb.0: ; %entry
891; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
892; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
893; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
894; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
895; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
896; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
897; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
898; SKIP-CACHE-INV-NEXT:    s_endpgm
899;
900; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
901; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
902; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
903; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
904; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
906; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
907; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
908; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
909;
910; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
911; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
912; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
913; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
914; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
916; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
917; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
918; GFX90A-TGSPLIT-NEXT:    s_endpgm
919    i32* %out, i32 %in) {
920entry:
  ; %val has no uses in this function; the checks above expect the swap
  ; without a destination register and without glc.
921  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
922  ret void
923}
924
; Non-returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and seq_cst ordering. As checked below, every RUN
; configuration expects just the kernel-argument loads, the VGPR copies and
; one flat_atomic_swap immediately followed by s_endpgm.
925define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
926; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
927; GFX7:       ; %bb.0: ; %entry
928; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
929; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
930; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX7-NEXT:    v_mov_b32_e32 v0, s0
932; GFX7-NEXT:    v_mov_b32_e32 v1, s1
933; GFX7-NEXT:    v_mov_b32_e32 v2, s2
934; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
935; GFX7-NEXT:    s_endpgm
936;
937; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
938; GFX10-WGP:       ; %bb.0: ; %entry
939; GFX10-WGP-NEXT:    s_clause 0x1
940; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
941; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
942; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
943; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
944; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
945; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
946; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
947; GFX10-WGP-NEXT:    s_endpgm
948;
949; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
950; GFX10-CU:       ; %bb.0: ; %entry
951; GFX10-CU-NEXT:    s_clause 0x1
952; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
953; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
954; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
956; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
957; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
958; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
959; GFX10-CU-NEXT:    s_endpgm
960;
961; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw:
962; SKIP-CACHE-INV:       ; %bb.0: ; %entry
963; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
964; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
965; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
967; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
968; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
969; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
970; SKIP-CACHE-INV-NEXT:    s_endpgm
971;
972; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
973; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
974; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
975; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
976; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
978; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
979; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
980; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
981;
982; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
983; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
984; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
985; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
986; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
988; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
989; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
990; GFX90A-TGSPLIT-NEXT:    s_endpgm
991    i32* %out, i32 %in) {
992entry:
  ; %val has no uses in this function; the checks above expect the swap
  ; without a destination register and without glc.
993  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
994  ret void
995}
996
; Returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and acquire ordering; the value produced by the
; exchange is then stored back through %out.
; Every RUN configuration expects the returning form of the swap (destination
; v2, glc) followed by an s_waitcnt before the dependent flat_store_dword.
; The gfx90a run with -mattr=+tgsplit expects that wait to name vmcnt(0)
; only; all other runs expect vmcnt(0) lgkmcnt(0).
997define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
998; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
999; GFX7:       ; %bb.0: ; %entry
1000; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1001; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1004; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1005; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1006; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1007; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1008; GFX7-NEXT:    flat_store_dword v[0:1], v2
1009; GFX7-NEXT:    s_endpgm
1010;
1011; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1012; GFX10-WGP:       ; %bb.0: ; %entry
1013; GFX10-WGP-NEXT:    s_clause 0x1
1014; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1015; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1016; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1018; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1019; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1020; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1021; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1022; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1023; GFX10-WGP-NEXT:    s_endpgm
1024;
1025; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1026; GFX10-CU:       ; %bb.0: ; %entry
1027; GFX10-CU-NEXT:    s_clause 0x1
1028; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1029; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1030; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1032; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1033; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1034; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1035; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1036; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1037; GFX10-CU-NEXT:    s_endpgm
1038;
1039; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1040; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1041; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1042; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1043; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1044; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1045; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1046; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1047; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1048; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1049; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1050; SKIP-CACHE-INV-NEXT:    s_endpgm
1051;
1052; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1053; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1054; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1055; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1056; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1057; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1058; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1059; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1060; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1061; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1062; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1063;
1064; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
1065; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1066; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1067; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1068; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1070; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1071; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1072; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1073; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1074; GFX90A-TGSPLIT-NEXT:    s_endpgm
1075    i32* %out, i32 %in) {
1076entry:
1077  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
  ; Storing %val gives the atomic result a use, which is what makes the checks
  ; above expect the returning (glc) form of the swap.
1078  store i32 %val, i32* %out, align 4
1079  ret void
1080}
1081
; Returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and acq_rel ordering; the exchanged-out value is
; written back through %out.
; The checks expect, for each RUN configuration, the returning swap
; (destination v2, glc), one s_waitcnt, then the flat_store_dword of v2.
; The wait is vmcnt(0) lgkmcnt(0) everywhere except the gfx90a run with
; -mattr=+tgsplit, where only vmcnt(0) is expected.
1082define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
1083; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1084; GFX7:       ; %bb.0: ; %entry
1085; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1086; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1087; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1089; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1090; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1091; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1092; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1093; GFX7-NEXT:    flat_store_dword v[0:1], v2
1094; GFX7-NEXT:    s_endpgm
1095;
1096; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1097; GFX10-WGP:       ; %bb.0: ; %entry
1098; GFX10-WGP-NEXT:    s_clause 0x1
1099; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1100; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1101; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1102; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1103; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1104; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1105; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1106; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1107; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1108; GFX10-WGP-NEXT:    s_endpgm
1109;
1110; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1111; GFX10-CU:       ; %bb.0: ; %entry
1112; GFX10-CU-NEXT:    s_clause 0x1
1113; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1114; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1115; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1117; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1118; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1119; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1120; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1121; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1122; GFX10-CU-NEXT:    s_endpgm
1123;
1124; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1125; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1126; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1127; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1128; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1129; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1130; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1131; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1132; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1133; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1134; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1135; SKIP-CACHE-INV-NEXT:    s_endpgm
1136;
1137; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1138; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1139; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1140; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1141; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1142; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1143; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1144; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1145; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1146; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1147; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1148;
1149; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
1150; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1151; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1152; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1153; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1155; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1156; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1157; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1158; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1159; GFX90A-TGSPLIT-NEXT:    s_endpgm
1160    i32* %out, i32 %in) {
1161entry:
1162  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
  ; Storing %val gives the atomic result a use, which is what makes the checks
  ; above expect the returning (glc) form of the swap.
1163  store i32 %val, i32* %out, align 4
1164  ret void
1165}
1166
; Returning atomic exchange (atomicrmw xchg) through an i32* pointer with
; syncscope("singlethread") and seq_cst ordering; the old value returned by
; the exchange is stored back through %out.
; Expected per RUN configuration: the returning swap (destination v2, glc),
; a single s_waitcnt, and then flat_store_dword of v2. Only the gfx90a run
; with -mattr=+tgsplit expects the wait as vmcnt(0) alone; the remaining
; runs expect vmcnt(0) lgkmcnt(0).
1167define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
1168; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1169; GFX7:       ; %bb.0: ; %entry
1170; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1171; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1172; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1174; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1175; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1176; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1177; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1178; GFX7-NEXT:    flat_store_dword v[0:1], v2
1179; GFX7-NEXT:    s_endpgm
1180;
1181; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1182; GFX10-WGP:       ; %bb.0: ; %entry
1183; GFX10-WGP-NEXT:    s_clause 0x1
1184; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1185; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1186; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1188; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1189; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1190; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1191; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1192; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1193; GFX10-WGP-NEXT:    s_endpgm
1194;
1195; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1196; GFX10-CU:       ; %bb.0: ; %entry
1197; GFX10-CU-NEXT:    s_clause 0x1
1198; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1199; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1200; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1202; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1203; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1204; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1205; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1206; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1207; GFX10-CU-NEXT:    s_endpgm
1208;
1209; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1210; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1211; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1212; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1213; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1217; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1218; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1219; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1220; SKIP-CACHE-INV-NEXT:    s_endpgm
1221;
1222; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1223; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1224; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1225; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1226; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1227; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1228; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1229; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1230; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1231; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1232; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1233;
1234; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
1235; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1236; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1237; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1238; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1239; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1240; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1241; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1242; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1243; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1244; GFX90A-TGSPLIT-NEXT:    s_endpgm
1245    i32* %out, i32 %in) {
1246entry:
1247  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
  ; Storing %val gives the atomic result a use, which is what makes the checks
  ; above expect the returning (glc) form of the swap.
1248  store i32 %val, i32* %out, align 4
1249  ret void
1250}
1251
; Non-returning compare-and-swap (cmpxchg) on the i32 located 4 elements
; (16 bytes) past %out, with syncscope("singlethread"), success ordering
; monotonic and failure ordering monotonic.
; All RUN configurations expect a single flat_atomic_cmpswap right before
; s_endpgm. They differ only in how the 16-byte displacement is applied:
; the gfx700, gfx1010 and amdpal runs expect it added to the scalar address
; with s_add_u32/s_addc_u32, while both gfx90a runs expect it as offset:16
; on the atomic instruction itself.
1252define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
1253; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1254; GFX7:       ; %bb.0: ; %entry
1255; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1256; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1257; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX7-NEXT:    s_add_u32 s0, s0, 16
1259; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1260; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1261; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1262; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1264; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1265; GFX7-NEXT:    s_endpgm
1266;
1267; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1268; GFX10-WGP:       ; %bb.0: ; %entry
1269; GFX10-WGP-NEXT:    s_clause 0x1
1270; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1271; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1272; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1274; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1275; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1276; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1277; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1278; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1279; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1280; GFX10-WGP-NEXT:    s_endpgm
1281;
1282; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1283; GFX10-CU:       ; %bb.0: ; %entry
1284; GFX10-CU-NEXT:    s_clause 0x1
1285; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1286; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1287; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1289; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1290; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1291; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1292; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1293; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1294; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1295; GFX10-CU-NEXT:    s_endpgm
1296;
1297; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1298; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1299; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1300; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1301; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1302; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1303; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1304; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1305; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1308; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1309; SKIP-CACHE-INV-NEXT:    s_endpgm
1310;
1311; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1312; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1313; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1314; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1315; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1316; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1317; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1318; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1319; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1320;
1321; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
1322; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1323; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1324; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1325; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1327; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1328; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1329; GFX90A-TGSPLIT-NEXT:    s_endpgm
1330    i32* %out, i32 %in, i32 %old) {
1331entry:
  ; Index 4 on an i32 pointer is a 16-byte displacement; this is the constant
  ; the checks above look for (either as "16" in the add or as offset:16).
1332  %gep = getelementptr i32, i32* %out, i32 4
  ; %val has no uses; the checks expect the cmpswap without a destination
  ; register and without glc.
1333  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
1334  ret void
1335}
1336
; Non-returning compare-and-swap (cmpxchg) on the i32 located 4 elements
; (16 bytes) past %out, with syncscope("singlethread"), success ordering
; acquire and failure ordering monotonic.
; Each RUN configuration expects one flat_atomic_cmpswap directly before
; s_endpgm, with nothing checked after the atomic. The 16-byte displacement
; is expected as an s_add_u32/s_addc_u32 pair on the gfx700, gfx1010 and
; amdpal runs, and as offset:16 on the instruction for both gfx90a runs.
1337define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
1338; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1339; GFX7:       ; %bb.0: ; %entry
1340; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1341; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1342; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX7-NEXT:    s_add_u32 s0, s0, 16
1344; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1345; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1346; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1347; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1348; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1349; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1350; GFX7-NEXT:    s_endpgm
1351;
1352; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1353; GFX10-WGP:       ; %bb.0: ; %entry
1354; GFX10-WGP-NEXT:    s_clause 0x1
1355; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1356; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1357; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1359; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1360; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1361; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1362; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1363; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1364; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1365; GFX10-WGP-NEXT:    s_endpgm
1366;
1367; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1368; GFX10-CU:       ; %bb.0: ; %entry
1369; GFX10-CU-NEXT:    s_clause 0x1
1370; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1371; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1372; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1374; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1375; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1376; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1377; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1378; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1379; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1380; GFX10-CU-NEXT:    s_endpgm
1381;
1382; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1383; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1384; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1386; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1387; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1388; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1389; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1390; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1393; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1394; SKIP-CACHE-INV-NEXT:    s_endpgm
1395;
1396; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1397; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1398; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1399; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1400; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1401; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1402; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1403; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1404; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1405;
1406; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
1407; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1408; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1409; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1410; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1412; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1413; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1414; GFX90A-TGSPLIT-NEXT:    s_endpgm
1415    i32* %out, i32 %in, i32 %old) {
1416entry:
  ; Index 4 on an i32 pointer is a 16-byte displacement; this is the constant
  ; the checks above look for (either as "16" in the add or as offset:16).
1417  %gep = getelementptr i32, i32* %out, i32 4
  ; %val has no uses; the checks expect the cmpswap without a destination
  ; register and without glc.
1418  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
1419  ret void
1420}
1421
; Scenario: cmpxchg with syncscope("singlethread"), release success ordering and
; monotonic failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  ret void
}
1506
; Scenario: cmpxchg with syncscope("singlethread"), acq_rel success ordering and
; monotonic failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ret void
}
1591
; Scenario: cmpxchg with syncscope("singlethread"), seq_cst success ordering and
; monotonic failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ret void
}
1676
; Scenario: cmpxchg with syncscope("singlethread"), monotonic success ordering
; and acquire failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
  ret void
}
1761
; Scenario: cmpxchg with syncscope("singlethread"), acquire success ordering and
; acquire failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  ret void
}
1846
; Scenario: cmpxchg with syncscope("singlethread"), release success ordering and
; acquire failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
  ret void
}
1931
; Scenario: cmpxchg with syncscope("singlethread"), acq_rel success ordering and
; acquire failure ordering, on a generic (addrspace 0) i32 pointer.
; The assertion blocks below expect a bare flat_atomic_cmpswap followed directly
; by s_endpgm in every run configuration. The 16-byte GEP offset is materialised
; with s_add_u32/s_addc_u32 on the gfx700 and gfx1010 runs and folded into
; "offset:16" on the gfx90a runs.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Target address is %out advanced by four i32 elements (16 bytes).
  %gep = getelementptr i32, i32* %out, i32 4
  ; The cmpxchg result is intentionally unused; only the emitted code is checked.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  ret void
}
2016
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; success ordering seq_cst and failure ordering acquire; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2017define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
2018; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2019; GFX7:       ; %bb.0: ; %entry
2020; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2021; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2022; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2023; GFX7-NEXT:    s_add_u32 s0, s0, 16
2024; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2025; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2026; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2027; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2028; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2029; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2030; GFX7-NEXT:    s_endpgm
2031;
2032; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2033; GFX10-WGP:       ; %bb.0: ; %entry
2034; GFX10-WGP-NEXT:    s_clause 0x1
2035; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2036; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2037; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2038; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2039; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2040; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2041; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2042; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2043; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2044; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2045; GFX10-WGP-NEXT:    s_endpgm
2046;
2047; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2048; GFX10-CU:       ; %bb.0: ; %entry
2049; GFX10-CU-NEXT:    s_clause 0x1
2050; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2051; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2052; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2054; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2055; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2056; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2057; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2058; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2059; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2060; GFX10-CU-NEXT:    s_endpgm
2061;
2062; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2063; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2064; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2065; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2066; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2067; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2068; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2069; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2072; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2073; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2074; SKIP-CACHE-INV-NEXT:    s_endpgm
2075;
2076; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2077; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2078; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2079; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2080; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2081; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2082; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2083; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2084; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2085;
2086; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
2087; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2088; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2089; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2090; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2091; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2092; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2093; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2094; GFX90A-TGSPLIT-NEXT:    s_endpgm
2095    i32* %out, i32 %in, i32 %old) {
2096entry:
2097  %gep = getelementptr i32, i32* %out, i32 4
2098  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
2099  ret void
2100}
2101
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; success ordering monotonic and failure ordering seq_cst; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2102define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
2103; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2104; GFX7:       ; %bb.0: ; %entry
2105; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2106; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2107; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2108; GFX7-NEXT:    s_add_u32 s0, s0, 16
2109; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2110; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2111; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2112; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2113; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2114; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2115; GFX7-NEXT:    s_endpgm
2116;
2117; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2118; GFX10-WGP:       ; %bb.0: ; %entry
2119; GFX10-WGP-NEXT:    s_clause 0x1
2120; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2121; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2122; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2123; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2124; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2125; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2126; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2127; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2128; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2129; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2130; GFX10-WGP-NEXT:    s_endpgm
2131;
2132; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2133; GFX10-CU:       ; %bb.0: ; %entry
2134; GFX10-CU-NEXT:    s_clause 0x1
2135; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2136; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2137; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2139; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2140; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2141; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2142; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2143; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2144; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2145; GFX10-CU-NEXT:    s_endpgm
2146;
2147; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2148; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2149; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2150; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2151; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2152; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2153; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2154; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2155; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2156; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2157; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2158; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2159; SKIP-CACHE-INV-NEXT:    s_endpgm
2160;
2161; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2162; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2163; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2164; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2165; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2166; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2167; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2168; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2169; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2170;
2171; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
2172; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2173; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2174; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2175; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2176; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2177; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2178; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2179; GFX90A-TGSPLIT-NEXT:    s_endpgm
2180    i32* %out, i32 %in, i32 %old) {
2181entry:
2182  %gep = getelementptr i32, i32* %out, i32 4
2183  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
2184  ret void
2185}
2186
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; success ordering acquire and failure ordering seq_cst; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2187define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
2188; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2189; GFX7:       ; %bb.0: ; %entry
2190; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2191; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2192; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2193; GFX7-NEXT:    s_add_u32 s0, s0, 16
2194; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2195; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2196; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2197; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2198; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2199; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2200; GFX7-NEXT:    s_endpgm
2201;
2202; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2203; GFX10-WGP:       ; %bb.0: ; %entry
2204; GFX10-WGP-NEXT:    s_clause 0x1
2205; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2206; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2207; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2209; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2210; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2211; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2212; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2213; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2214; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2215; GFX10-WGP-NEXT:    s_endpgm
2216;
2217; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2218; GFX10-CU:       ; %bb.0: ; %entry
2219; GFX10-CU-NEXT:    s_clause 0x1
2220; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2221; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2222; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2223; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2224; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2225; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2226; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2227; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2228; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2229; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2230; GFX10-CU-NEXT:    s_endpgm
2231;
2232; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2233; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2234; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2235; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2236; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2237; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2238; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2239; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2240; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2241; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2242; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2243; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2244; SKIP-CACHE-INV-NEXT:    s_endpgm
2245;
2246; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2247; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2248; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2249; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2250; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2252; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2253; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2254; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2255;
2256; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
2257; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2258; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2259; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2260; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2262; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2263; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2264; GFX90A-TGSPLIT-NEXT:    s_endpgm
2265    i32* %out, i32 %in, i32 %old) {
2266entry:
2267  %gep = getelementptr i32, i32* %out, i32 4
2268  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
2269  ret void
2270}
2271
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; success ordering release and failure ordering seq_cst; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2272define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
2273; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2274; GFX7:       ; %bb.0: ; %entry
2275; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2276; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2277; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2278; GFX7-NEXT:    s_add_u32 s0, s0, 16
2279; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2280; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2281; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2282; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2283; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2284; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2285; GFX7-NEXT:    s_endpgm
2286;
2287; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2288; GFX10-WGP:       ; %bb.0: ; %entry
2289; GFX10-WGP-NEXT:    s_clause 0x1
2290; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2291; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2292; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2293; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2294; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2295; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2296; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2297; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2298; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2299; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2300; GFX10-WGP-NEXT:    s_endpgm
2301;
2302; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2303; GFX10-CU:       ; %bb.0: ; %entry
2304; GFX10-CU-NEXT:    s_clause 0x1
2305; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2306; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2307; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2308; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2309; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2310; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2311; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2312; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2313; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2314; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2315; GFX10-CU-NEXT:    s_endpgm
2316;
2317; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2318; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2319; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2321; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2322; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2323; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2328; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2329; SKIP-CACHE-INV-NEXT:    s_endpgm
2330;
2331; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2332; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2333; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2334; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2335; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2336; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2337; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2338; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2339; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2340;
2341; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
2342; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2343; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2344; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2345; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2347; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2348; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2349; GFX90A-TGSPLIT-NEXT:    s_endpgm
2350    i32* %out, i32 %in, i32 %old) {
2351entry:
2352  %gep = getelementptr i32, i32* %out, i32 4
2353  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
2354  ret void
2355}
2356
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; success ordering acq_rel and failure ordering seq_cst; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2357define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
2358; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2359; GFX7:       ; %bb.0: ; %entry
2360; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2361; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2362; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2363; GFX7-NEXT:    s_add_u32 s0, s0, 16
2364; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2365; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2366; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2367; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2368; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2369; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2370; GFX7-NEXT:    s_endpgm
2371;
2372; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2373; GFX10-WGP:       ; %bb.0: ; %entry
2374; GFX10-WGP-NEXT:    s_clause 0x1
2375; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2376; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2377; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2378; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2379; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2380; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2381; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2382; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2383; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2384; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2385; GFX10-WGP-NEXT:    s_endpgm
2386;
2387; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2388; GFX10-CU:       ; %bb.0: ; %entry
2389; GFX10-CU-NEXT:    s_clause 0x1
2390; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2391; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2392; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2393; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2394; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2395; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2396; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2397; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2398; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2399; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2400; GFX10-CU-NEXT:    s_endpgm
2401;
2402; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2403; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2404; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2405; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2406; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2407; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2408; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2409; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2410; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2411; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2412; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2413; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2414; SKIP-CACHE-INV-NEXT:    s_endpgm
2415;
2416; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2417; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2418; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2419; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2420; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2421; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2422; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2423; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2424; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2425;
2426; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
2427; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2428; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2429; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2430; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2431; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2432; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2433; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2434; GFX90A-TGSPLIT-NEXT:    s_endpgm
2435    i32* %out, i32 %in, i32 %old) {
2436entry:
2437  %gep = getelementptr i32, i32* %out, i32 4
2438  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
2439  ret void
2440}
2441
; Volatile cmpxchg through a flat pointer at syncscope("singlethread") with
; seq_cst for both the success and the failure ordering; the result is unused.
; The element-4 GEP is a 16-byte displacement: the gfx90a checks carry it as an
; immediate offset of 16 on the atomic, the remaining checks add 16 to the base
; address with s_add_u32/s_addc_u32. In every check block flat_atomic_cmpswap
; is followed directly by s_endpgm.
2442define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
2443; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2444; GFX7:       ; %bb.0: ; %entry
2445; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2446; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2447; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2448; GFX7-NEXT:    s_add_u32 s0, s0, 16
2449; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2450; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2451; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2452; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2453; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2454; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2455; GFX7-NEXT:    s_endpgm
2456;
2457; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2458; GFX10-WGP:       ; %bb.0: ; %entry
2459; GFX10-WGP-NEXT:    s_clause 0x1
2460; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2461; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2462; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2464; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2465; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2466; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2467; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2468; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2469; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2470; GFX10-WGP-NEXT:    s_endpgm
2471;
2472; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2473; GFX10-CU:       ; %bb.0: ; %entry
2474; GFX10-CU-NEXT:    s_clause 0x1
2475; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2476; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2477; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2479; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2480; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2481; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2482; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2483; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2484; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2485; GFX10-CU-NEXT:    s_endpgm
2486;
2487; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2488; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2489; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2490; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2491; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2492; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2493; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2494; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2498; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2499; SKIP-CACHE-INV-NEXT:    s_endpgm
2500;
2501; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2502; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2503; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2505; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2507; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2508; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2509; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2510;
2511; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
2512; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2513; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2514; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2515; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2516; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2517; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2518; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2519; GFX90A-TGSPLIT-NEXT:    s_endpgm
2520    i32* %out, i32 %in, i32 %old) {
2521entry:
2522  %gep = getelementptr i32, i32* %out, i32 4
2523  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
2524  ret void
2525}
2526
; Volatile monotonic/monotonic cmpxchg through a flat pointer at
; syncscope("singlethread") whose loaded value (field 0 of the result pair) is
; stored back to %out. The checks expect the returning form of
; flat_atomic_cmpswap (destination v2, glc modifier) and an s_waitcnt before
; the flat_store_dword that consumes v2. With +tgsplit on gfx90a that wait is
; vmcnt(0) only; in the other check blocks it is vmcnt(0) lgkmcnt(0).
; The element-4 GEP is a 16-byte displacement, so the un-offset base address is
; kept for the final store.
2527define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
2528; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2529; GFX7:       ; %bb.0: ; %entry
2530; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2531; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2532; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2533; GFX7-NEXT:    s_add_u32 s4, s0, 16
2534; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2535; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2536; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2537; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2538; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2539; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2540; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2541; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2542; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2543; GFX7-NEXT:    flat_store_dword v[0:1], v2
2544; GFX7-NEXT:    s_endpgm
2545;
2546; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2547; GFX10-WGP:       ; %bb.0: ; %entry
2548; GFX10-WGP-NEXT:    s_clause 0x1
2549; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2550; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2551; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2552; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2553; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2554; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2555; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2556; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2557; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2558; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2559; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2560; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2561; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2562; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2563; GFX10-WGP-NEXT:    s_endpgm
2564;
2565; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2566; GFX10-CU:       ; %bb.0: ; %entry
2567; GFX10-CU-NEXT:    s_clause 0x1
2568; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2569; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2570; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2571; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2572; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2573; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2574; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2575; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2576; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2577; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2578; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2579; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2580; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2581; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2582; GFX10-CU-NEXT:    s_endpgm
2583;
2584; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2585; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2586; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2587; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2588; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2589; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2590; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2592; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2593; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2595; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2598; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2599; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2600; SKIP-CACHE-INV-NEXT:    s_endpgm
2601;
2602; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2603; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2604; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2605; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2606; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2607; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2608; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2609; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2610; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2611; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2612; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2613;
2614; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
2615; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2616; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2617; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2618; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2619; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2620; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2621; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2622; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2623; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2624; GFX90A-TGSPLIT-NEXT:    s_endpgm
2625    i32* %out, i32 %in, i32 %old) {
2626entry:
2627  %gep = getelementptr i32, i32* %out, i32 4
2628  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
2629  %val0 = extractvalue { i32, i1 } %val, 0
2630  store i32 %val0, i32* %out, align 4
2631  ret void
2632}
2633
2634define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
2635; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2636; GFX7:       ; %bb.0: ; %entry
2637; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2638; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2639; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2640; GFX7-NEXT:    s_add_u32 s4, s0, 16
2641; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2642; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2643; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2644; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2645; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2646; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2647; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2648; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2649; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2650; GFX7-NEXT:    flat_store_dword v[0:1], v2
2651; GFX7-NEXT:    s_endpgm
2652;
2653; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2654; GFX10-WGP:       ; %bb.0: ; %entry
2655; GFX10-WGP-NEXT:    s_clause 0x1
2656; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2657; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2658; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2660; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2661; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2662; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2663; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2664; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2665; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2666; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2667; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2668; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2669; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2670; GFX10-WGP-NEXT:    s_endpgm
2671;
2672; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2673; GFX10-CU:       ; %bb.0: ; %entry
2674; GFX10-CU-NEXT:    s_clause 0x1
2675; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2676; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2677; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2678; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2679; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2680; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2681; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2682; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2683; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2684; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2685; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2686; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2687; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2688; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2689; GFX10-CU-NEXT:    s_endpgm
2690;
2691; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2692; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2693; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2694; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2695; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2696; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2697; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2698; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2699; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2700; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2701; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2702; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2703; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2705; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2706; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2707; SKIP-CACHE-INV-NEXT:    s_endpgm
2708;
2709; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2710; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2711; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2712; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2713; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2714; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2715; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2716; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2717; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2718; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2719; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2720;
2721; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
2722; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2723; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2724; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2725; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2727; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2728; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2729; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2730; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2731; GFX90A-TGSPLIT-NEXT:    s_endpgm
2732    i32* %out, i32 %in, i32 %old) {
2733entry:
2734  %gep = getelementptr i32, i32* %out, i32 4
2735  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
2736  %val0 = extractvalue { i32, i1 } %val, 0
2737  store i32 %val0, i32* %out, align 4
2738  ret void
2739}
2740
; Returning cmpxchg through a plain i32* at syncscope("singlethread") with
; success ordering release and failure ordering monotonic.
; The exchange targets the i32 at index 4 from %out (the 16-byte offset that
; shows up in the checked output as `s_add_u32 ..., 16` or `offset:16`), and
; the value the cmpxchg returns is stored to %out.
; In every checked configuration the expected code is a single
; `flat_atomic_cmpswap ... glc` with an s_waitcnt ahead of the flat_store_dword
; that consumes its result.
2741define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
2742; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2743; GFX7:       ; %bb.0: ; %entry
2744; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2745; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2746; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2747; GFX7-NEXT:    s_add_u32 s4, s0, 16
2748; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2749; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2750; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2751; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2752; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2753; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2754; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2755; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2756; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2757; GFX7-NEXT:    flat_store_dword v[0:1], v2
2758; GFX7-NEXT:    s_endpgm
2759;
2760; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2761; GFX10-WGP:       ; %bb.0: ; %entry
2762; GFX10-WGP-NEXT:    s_clause 0x1
2763; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2764; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2765; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2767; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2768; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2769; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2770; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2771; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2772; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2773; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2774; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2775; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2776; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2777; GFX10-WGP-NEXT:    s_endpgm
2778;
2779; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2780; GFX10-CU:       ; %bb.0: ; %entry
2781; GFX10-CU-NEXT:    s_clause 0x1
2782; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2783; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2784; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2785; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2786; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2787; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2788; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2789; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2790; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2791; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2792; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2793; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2794; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2795; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2796; GFX10-CU-NEXT:    s_endpgm
2797;
2798; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2799; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2800; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2801; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2802; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2803; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2804; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2806; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2809; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2810; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2811; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2812; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2813; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2814; SKIP-CACHE-INV-NEXT:    s_endpgm
2815;
2816; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2817; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2818; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2819; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2820; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2821; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2822; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2823; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2824; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2825; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2826; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2827;
2828; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
2829; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2830; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2831; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2832; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2833; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2834; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2835; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2836; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2837; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2838; GFX90A-TGSPLIT-NEXT:    s_endpgm
2839    i32* %out, i32 %in, i32 %old) {
2840entry:
  ; Address of the i32 at index 4 from %out; this is the cmpxchg target.
2841  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; orderings: release (success), monotonic (failure).
2842  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg loaded; the i1 flag is unused.
2843  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
2844  store i32 %val0, i32* %out, align 4
2845  ret void
2846}
2847
; Returning cmpxchg through a plain i32* at syncscope("singlethread") with
; success ordering acq_rel and failure ordering monotonic.
; The exchange targets the i32 at index 4 from %out (the 16-byte offset that
; shows up in the checked output as `s_add_u32 ..., 16` or `offset:16`), and
; the value the cmpxchg returns is stored to %out.
; In every checked configuration the expected code is a single
; `flat_atomic_cmpswap ... glc` with an s_waitcnt ahead of the flat_store_dword
; that consumes its result.
2848define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
2849; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2850; GFX7:       ; %bb.0: ; %entry
2851; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2852; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2853; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2854; GFX7-NEXT:    s_add_u32 s4, s0, 16
2855; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2856; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2857; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2858; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2859; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2860; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2861; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2862; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2863; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2864; GFX7-NEXT:    flat_store_dword v[0:1], v2
2865; GFX7-NEXT:    s_endpgm
2866;
2867; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2868; GFX10-WGP:       ; %bb.0: ; %entry
2869; GFX10-WGP-NEXT:    s_clause 0x1
2870; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2871; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2872; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2874; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2875; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2876; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2877; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2878; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2879; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2880; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2881; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2882; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2883; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2884; GFX10-WGP-NEXT:    s_endpgm
2885;
2886; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2887; GFX10-CU:       ; %bb.0: ; %entry
2888; GFX10-CU-NEXT:    s_clause 0x1
2889; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2890; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2891; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2892; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2893; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2895; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2896; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2897; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2898; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2900; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2901; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2902; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2903; GFX10-CU-NEXT:    s_endpgm
2904;
2905; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2906; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2907; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2908; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2909; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2910; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2911; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2913; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2916; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2919; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2920; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2921; SKIP-CACHE-INV-NEXT:    s_endpgm
2922;
2923; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2924; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2925; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2926; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2927; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2928; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2929; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2930; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2931; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2932; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2933; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2934;
2935; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
2936; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2937; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2938; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2939; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2940; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2941; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2942; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2943; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2944; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2945; GFX90A-TGSPLIT-NEXT:    s_endpgm
2946    i32* %out, i32 %in, i32 %old) {
2947entry:
  ; Address of the i32 at index 4 from %out; this is the cmpxchg target.
2948  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; orderings: acq_rel (success), monotonic (failure).
2949  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg loaded; the i1 flag is unused.
2950  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
2951  store i32 %val0, i32* %out, align 4
2952  ret void
2953}
2954
; Returning cmpxchg through a plain i32* at syncscope("singlethread") with
; success ordering seq_cst and failure ordering monotonic.
; The exchange targets the i32 at index 4 from %out (the 16-byte offset that
; shows up in the checked output as `s_add_u32 ..., 16` or `offset:16`), and
; the value the cmpxchg returns is stored to %out.
; In every checked configuration the expected code is a single
; `flat_atomic_cmpswap ... glc` with an s_waitcnt ahead of the flat_store_dword
; that consumes its result.
2955define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
2956; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
2957; GFX7:       ; %bb.0: ; %entry
2958; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2959; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2960; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2961; GFX7-NEXT:    s_add_u32 s4, s0, 16
2962; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2963; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2964; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2965; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2966; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2967; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2968; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2969; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2970; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2971; GFX7-NEXT:    flat_store_dword v[0:1], v2
2972; GFX7-NEXT:    s_endpgm
2973;
2974; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
2975; GFX10-WGP:       ; %bb.0: ; %entry
2976; GFX10-WGP-NEXT:    s_clause 0x1
2977; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2978; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2979; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2981; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2982; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2983; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2984; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2985; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2986; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2987; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2988; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2989; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2990; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2991; GFX10-WGP-NEXT:    s_endpgm
2992;
2993; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
2994; GFX10-CU:       ; %bb.0: ; %entry
2995; GFX10-CU-NEXT:    s_clause 0x1
2996; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2997; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2998; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2999; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3000; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3001; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3002; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3003; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3004; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3005; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3006; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3007; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3008; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3009; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3010; GFX10-CU-NEXT:    s_endpgm
3011;
3012; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
3013; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3014; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3015; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3016; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3017; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3018; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3019; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3020; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3021; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3022; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3023; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3024; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3025; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3026; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3027; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3028; SKIP-CACHE-INV-NEXT:    s_endpgm
3029;
3030; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
3031; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3032; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3033; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3034; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3035; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3036; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3037; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3038; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3039; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3040; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3041;
3042; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
3043; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3044; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3045; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3046; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3047; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3048; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3049; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3050; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3051; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3052; GFX90A-TGSPLIT-NEXT:    s_endpgm
3053    i32* %out, i32 %in, i32 %old) {
3054entry:
  ; Address of the i32 at index 4 from %out; this is the cmpxchg target.
3055  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; orderings: seq_cst (success), monotonic (failure).
3056  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg loaded; the i1 flag is unused.
3057  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
3058  store i32 %val0, i32* %out, align 4
3059  ret void
3060}
3061
; Returning cmpxchg through a plain i32* at syncscope("singlethread") with
; success ordering monotonic and failure ordering acquire.
; The exchange targets the i32 at index 4 from %out (the 16-byte offset that
; shows up in the checked output as `s_add_u32 ..., 16` or `offset:16`), and
; the value the cmpxchg returns is stored to %out.
; In every checked configuration the expected code is a single
; `flat_atomic_cmpswap ... glc` with an s_waitcnt ahead of the flat_store_dword
; that consumes its result.
3062define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
3063; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3064; GFX7:       ; %bb.0: ; %entry
3065; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3066; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3068; GFX7-NEXT:    s_add_u32 s4, s0, 16
3069; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3070; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3071; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3072; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3073; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3074; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3075; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3076; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3077; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3078; GFX7-NEXT:    flat_store_dword v[0:1], v2
3079; GFX7-NEXT:    s_endpgm
3080;
3081; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3082; GFX10-WGP:       ; %bb.0: ; %entry
3083; GFX10-WGP-NEXT:    s_clause 0x1
3084; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3085; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3086; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3087; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3088; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3089; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3090; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3091; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3092; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3093; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3094; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3095; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3096; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3097; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3098; GFX10-WGP-NEXT:    s_endpgm
3099;
3100; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3101; GFX10-CU:       ; %bb.0: ; %entry
3102; GFX10-CU-NEXT:    s_clause 0x1
3103; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3104; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3105; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3106; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3107; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3108; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3109; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3110; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3111; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3112; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3113; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3114; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3115; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3116; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3117; GFX10-CU-NEXT:    s_endpgm
3118;
3119; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3120; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3121; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3122; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3123; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3124; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3125; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3126; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3127; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3128; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3129; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3130; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3131; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3132; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3133; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3134; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3135; SKIP-CACHE-INV-NEXT:    s_endpgm
3136;
3137; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3138; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3139; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3140; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3141; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3143; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3144; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3145; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3146; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3147; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3148;
3149; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
3150; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3151; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3152; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3153; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3155; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3156; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3157; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3158; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3159; GFX90A-TGSPLIT-NEXT:    s_endpgm
3160    i32* %out, i32 %in, i32 %old) {
3161entry:
  ; Address of the i32 at index 4 from %out; this is the cmpxchg target.
3162  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; orderings: monotonic (success), acquire (failure).
3163  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg loaded; the i1 flag is unused.
3164  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
3165  store i32 %val0, i32* %out, align 4
3166  ret void
3167}
3168
; Returning cmpxchg through a plain i32* at syncscope("singlethread") with
; success ordering acquire and failure ordering acquire.
; The exchange targets the i32 at index 4 from %out (the 16-byte offset that
; shows up in the checked output as `s_add_u32 ..., 16` or `offset:16`), and
; the value the cmpxchg returns is stored to %out.
; In every checked configuration the expected code is a single
; `flat_atomic_cmpswap ... glc` with an s_waitcnt ahead of the flat_store_dword
; that consumes its result.
3169define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
3170; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3171; GFX7:       ; %bb.0: ; %entry
3172; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3173; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3175; GFX7-NEXT:    s_add_u32 s4, s0, 16
3176; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3177; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3178; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3179; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3180; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3181; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3182; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3183; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3184; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3185; GFX7-NEXT:    flat_store_dword v[0:1], v2
3186; GFX7-NEXT:    s_endpgm
3187;
3188; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3189; GFX10-WGP:       ; %bb.0: ; %entry
3190; GFX10-WGP-NEXT:    s_clause 0x1
3191; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3192; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3193; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3194; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3195; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3196; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3197; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3199; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3200; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3201; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3202; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3203; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3204; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3205; GFX10-WGP-NEXT:    s_endpgm
3206;
3207; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3208; GFX10-CU:       ; %bb.0: ; %entry
3209; GFX10-CU-NEXT:    s_clause 0x1
3210; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3211; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3212; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3213; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3214; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3215; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3216; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3217; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3218; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3219; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3220; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3221; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3222; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3223; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3224; GFX10-CU-NEXT:    s_endpgm
3225;
3226; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3227; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3229; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3230; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3231; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3232; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3236; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3237; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3238; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3239; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3240; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3241; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3242; SKIP-CACHE-INV-NEXT:    s_endpgm
3243;
3244; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3245; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3246; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3247; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3248; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3249; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3250; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3251; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3252; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3253; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3254; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3255;
3256; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
3257; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3258; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3259; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3260; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3261; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3262; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3263; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3264; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3265; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3266; GFX90A-TGSPLIT-NEXT:    s_endpgm
3267    i32* %out, i32 %in, i32 %old) {
3268entry:
  ; Address of the i32 at index 4 from %out; this is the cmpxchg target.
3269  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; orderings: acquire (success), acquire (failure).
3270  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value the cmpxchg loaded; the i1 flag is unused.
3271  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning form of the atomic live.
3272  store i32 %val0, i32* %out, align 4
3273  ret void
3274}
3275
3276define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
3277; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3278; GFX7:       ; %bb.0: ; %entry
3279; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3280; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3281; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3282; GFX7-NEXT:    s_add_u32 s4, s0, 16
3283; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3284; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3285; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3286; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3287; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3288; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3289; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3290; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3291; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3292; GFX7-NEXT:    flat_store_dword v[0:1], v2
3293; GFX7-NEXT:    s_endpgm
3294;
3295; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3296; GFX10-WGP:       ; %bb.0: ; %entry
3297; GFX10-WGP-NEXT:    s_clause 0x1
3298; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3299; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3300; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3301; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3302; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3303; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3304; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3305; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3306; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3307; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3308; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3310; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3311; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3312; GFX10-WGP-NEXT:    s_endpgm
3313;
3314; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3315; GFX10-CU:       ; %bb.0: ; %entry
3316; GFX10-CU-NEXT:    s_clause 0x1
3317; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3318; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3319; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3320; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3321; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3322; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3323; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3324; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3325; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3326; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3327; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3328; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3329; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3330; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3331; GFX10-CU-NEXT:    s_endpgm
3332;
3333; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3334; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3335; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3336; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3337; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3338; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3339; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3340; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3341; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3342; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3343; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3344; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3345; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3347; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3348; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3349; SKIP-CACHE-INV-NEXT:    s_endpgm
3350;
3351; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3352; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3353; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3354; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3355; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3356; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3357; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3358; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3359; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3360; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3361; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3362;
3363; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
3364; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3365; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3366; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3367; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3368; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3369; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3370; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3371; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3372; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3373; GFX90A-TGSPLIT-NEXT:    s_endpgm
3374    i32* %out, i32 %in, i32 %old) {
3375entry:
3376  %gep = getelementptr i32, i32* %out, i32 4
3377  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
3378  %val0 = extractvalue { i32, i1 } %val, 0
3379  store i32 %val0, i32* %out, align 4
3380  ret void
3381}
3382
; Returning cmpxchg on a flat pointer at syncscope("singlethread") with
; success ordering acq_rel and failure ordering acquire. The value the cmpxchg
; loaded is written back through %out.
; Expected code in every run configuration: one flat_atomic_cmpswap with glc,
; a single s_waitcnt, then the flat_store_dword of the result. The gfx90a runs
; fold the 16-byte offset into the atomic (offset:16); the other runs form the
; address with s_add_u32/s_addc_u32. The +tgsplit run waits on vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; The atomic targets element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  ; Field 0 of the cmpxchg result is the value read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
3489
; Returning cmpxchg on a flat pointer at syncscope("singlethread") with
; success ordering seq_cst and failure ordering acquire. The value the cmpxchg
; loaded is written back through %out.
; Expected code in every run configuration: one flat_atomic_cmpswap with glc,
; a single s_waitcnt, then the flat_store_dword of the result. The gfx90a runs
; fold the 16-byte offset into the atomic (offset:16); the other runs form the
; address with s_add_u32/s_addc_u32. The +tgsplit run waits on vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; The atomic targets element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
  ; Field 0 of the cmpxchg result is the value read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
3596
; Returning cmpxchg on a flat pointer at syncscope("singlethread") with
; success ordering monotonic and failure ordering seq_cst. The value the
; cmpxchg loaded is written back through %out.
; Expected code in every run configuration: one flat_atomic_cmpswap with glc,
; a single s_waitcnt, then the flat_store_dword of the result. The gfx90a runs
; fold the 16-byte offset into the atomic (offset:16); the other runs form the
; address with s_add_u32/s_addc_u32. The +tgsplit run waits on vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; The atomic targets element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
3703
; Returning cmpxchg on a flat pointer at syncscope("singlethread") with
; success ordering acquire and failure ordering seq_cst. The value the cmpxchg
; loaded is written back through %out.
; Expected code in every run configuration: one flat_atomic_cmpswap with glc,
; a single s_waitcnt, then the flat_store_dword of the result. The gfx90a runs
; fold the 16-byte offset into the atomic (offset:16); the other runs form the
; address with s_add_u32/s_addc_u32. The +tgsplit run waits on vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; The atomic targets element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
3810
; Returning cmpxchg on a flat pointer at syncscope("singlethread") with
; success ordering release and failure ordering seq_cst. The value the cmpxchg
; loaded is written back through %out.
; Expected code in every run configuration: one flat_atomic_cmpswap with glc,
; a single s_waitcnt, then the flat_store_dword of the result. The gfx90a runs
; fold the 16-byte offset into the atomic (offset:16); the other runs form the
; address with s_add_u32/s_addc_u32. The +tgsplit run waits on vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; The atomic targets element 4 of %out, i.e. 16 bytes past the base pointer.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
3917
3918define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
3919; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
3920; GFX7:       ; %bb.0: ; %entry
3921; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3922; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3923; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3924; GFX7-NEXT:    s_add_u32 s4, s0, 16
3925; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3926; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3927; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3928; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3929; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3930; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3931; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3932; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3933; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3934; GFX7-NEXT:    flat_store_dword v[0:1], v2
3935; GFX7-NEXT:    s_endpgm
3936;
3937; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
3938; GFX10-WGP:       ; %bb.0: ; %entry
3939; GFX10-WGP-NEXT:    s_clause 0x1
3940; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3941; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3942; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3943; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3944; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3945; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3946; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3947; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3948; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3949; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3950; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3951; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3952; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3953; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3954; GFX10-WGP-NEXT:    s_endpgm
3955;
3956; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
3957; GFX10-CU:       ; %bb.0: ; %entry
3958; GFX10-CU-NEXT:    s_clause 0x1
3959; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3960; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3961; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3962; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3963; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3964; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3965; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3966; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3967; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3968; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3969; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3970; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3971; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3972; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3973; GFX10-CU-NEXT:    s_endpgm
3974;
3975; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
3976; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3977; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3978; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3979; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3980; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3981; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3985; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3986; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3987; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3988; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3989; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3990; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3991; SKIP-CACHE-INV-NEXT:    s_endpgm
3992;
3993; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
3994; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3995; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3996; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3997; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3998; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3999; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4000; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4001; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4002; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4003; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4004;
4005; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
4006; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4007; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4008; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4009; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4010; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4011; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4012; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4013; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4014; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4015; GFX90A-TGSPLIT-NEXT:    s_endpgm
4016    i32* %out, i32 %in, i32 %old) {
4017entry:
4018  %gep = getelementptr i32, i32* %out, i32 4
4019  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
4020  %val0 = extractvalue { i32, i1 } %val, 0
4021  store i32 %val0, i32* %out, align 4
4022  ret void
4023}
4024
; Test: volatile cmpxchg on an i32* in the default address space with
; syncscope("singlethread") and seq_cst success / seq_cst failure ordering.
; The old value returned by the cmpxchg is stored back through %out.
; Every check prefix below expects a flat_atomic_cmpswap carrying glc, then an
; s_waitcnt, then a flat_store_dword. The two gfx90a prefixes fold the 16-byte
; displacement into the instruction (offset:16); the remaining prefixes build
; the address with s_add_u32/s_addc_u32. The tgsplit prefix waits on vmcnt(0)
; only, all others on vmcnt(0) lgkmcnt(0).
; The check lines are autogenerated (see the NOTE on the first line of the file).
4025define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
4026; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4027; GFX7:       ; %bb.0: ; %entry
4028; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4029; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4030; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4031; GFX7-NEXT:    s_add_u32 s4, s0, 16
4032; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4033; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4034; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4035; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4036; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4037; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4038; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4039; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4040; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4041; GFX7-NEXT:    flat_store_dword v[0:1], v2
4042; GFX7-NEXT:    s_endpgm
4043;
4044; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4045; GFX10-WGP:       ; %bb.0: ; %entry
4046; GFX10-WGP-NEXT:    s_clause 0x1
4047; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4048; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4049; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4050; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4051; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4052; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4053; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4054; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4055; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4056; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4057; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4058; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4059; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4060; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4061; GFX10-WGP-NEXT:    s_endpgm
4062;
4063; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4064; GFX10-CU:       ; %bb.0: ; %entry
4065; GFX10-CU-NEXT:    s_clause 0x1
4066; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4067; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4068; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4069; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4070; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4071; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4072; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4073; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4074; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4075; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4076; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4077; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4078; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4079; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4080; GFX10-CU-NEXT:    s_endpgm
4081;
4082; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4083; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4084; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4085; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4086; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4087; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4088; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4092; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4093; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4095; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4096; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4097; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4098; SKIP-CACHE-INV-NEXT:    s_endpgm
4099;
4100; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4101; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4103; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4104; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4105; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4106; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4107; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4109; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4110; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4111;
4112; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
4113; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4114; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4115; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4116; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4117; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4118; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4119; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4120; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4121; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4122; GFX90A-TGSPLIT-NEXT:    s_endpgm
4123    i32* %out, i32 %in, i32 %old) {
4124entry:
  ; The cmpxchg targets the i32 element at index 4 from %out, i.e. 16 bytes
  ; past it, which is the constant 16 seen in the expected address arithmetic.
4125  %gep = getelementptr i32, i32* %out, i32 4
4126  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded; the i1
  ; success flag in field 1 is not used by this test.
4127  %val0 = extractvalue { i32, i1 } %val, 0
4128  store i32 %val0, i32* %out, align 4
4129  ret void
4130}
4131
; Test: atomic i32 load through an i32* in the default address space with
; syncscope("singlethread-one-as") and unordered ordering; the loaded value is
; then written to %out with a non-atomic store.
; Every check prefix below expects a flat_load_dword without glc, an
; s_waitcnt, and a flat_store_dword. The tgsplit prefix waits on vmcnt(0)
; only; all other prefixes wait on vmcnt(0) lgkmcnt(0).
; The check lines are autogenerated (see the NOTE on the first line of the file).
4132define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
4133; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
4134; GFX7:       ; %bb.0: ; %entry
4135; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4136; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4137; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4138; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4139; GFX7-NEXT:    flat_load_dword v0, v[0:1]
4140; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4141; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4142; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4143; GFX7-NEXT:    flat_store_dword v[2:3], v0
4144; GFX7-NEXT:    s_endpgm
4145;
4146; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
4147; GFX10-WGP:       ; %bb.0: ; %entry
4148; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4149; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4150; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4151; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4152; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
4153; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4154; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4155; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4156; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4157; GFX10-WGP-NEXT:    s_endpgm
4158;
4159; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
4160; GFX10-CU:       ; %bb.0: ; %entry
4161; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4162; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4163; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4164; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4165; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
4166; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4167; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4168; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4169; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4170; GFX10-CU-NEXT:    s_endpgm
4171;
4172; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load:
4173; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4174; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4175; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4176; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4177; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4178; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
4179; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4180; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4181; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4182; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
4183; SKIP-CACHE-INV-NEXT:    s_endpgm
4184;
4185; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
4186; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4187; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4188; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4189; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4190; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4191; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4192; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4193; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4194; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4195; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4196; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4197;
4198; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
4199; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4200; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4201; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4202; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4203; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4204; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4205; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4206; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4207; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4208; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4209; GFX90A-TGSPLIT-NEXT:    s_endpgm
4210    i32* %in, i32* %out) {
4211entry:
4212  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
4213  store i32 %val, i32* %out
4214  ret void
4215}
4216
; Test: atomic i32 load through an i32* in the default address space with
; syncscope("singlethread-one-as") and monotonic ordering; the loaded value is
; then written to %out with a non-atomic store.
; Every check prefix below expects a flat_load_dword without glc, an
; s_waitcnt, and a flat_store_dword. The tgsplit prefix waits on vmcnt(0)
; only; all other prefixes wait on vmcnt(0) lgkmcnt(0).
; The check lines are autogenerated (see the NOTE on the first line of the file).
4217define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
4218; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
4219; GFX7:       ; %bb.0: ; %entry
4220; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4221; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4222; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4223; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4224; GFX7-NEXT:    flat_load_dword v0, v[0:1]
4225; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4226; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4227; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4228; GFX7-NEXT:    flat_store_dword v[2:3], v0
4229; GFX7-NEXT:    s_endpgm
4230;
4231; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
4232; GFX10-WGP:       ; %bb.0: ; %entry
4233; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4234; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4235; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4236; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4237; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
4238; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4239; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4240; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4241; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4242; GFX10-WGP-NEXT:    s_endpgm
4243;
4244; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
4245; GFX10-CU:       ; %bb.0: ; %entry
4246; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4247; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4248; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4249; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4250; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
4251; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4252; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4253; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4254; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4255; GFX10-CU-NEXT:    s_endpgm
4256;
4257; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load:
4258; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4259; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4260; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4261; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4262; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4263; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
4264; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4265; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4266; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4267; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
4268; SKIP-CACHE-INV-NEXT:    s_endpgm
4269;
4270; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
4271; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4272; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4273; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4274; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4275; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4276; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4277; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4278; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4279; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4280; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4281; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4282;
4283; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
4284; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4285; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4286; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4287; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4288; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4289; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4290; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4291; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4292; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4293; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4294; GFX90A-TGSPLIT-NEXT:    s_endpgm
4295    i32* %in, i32* %out) {
4296entry:
4297  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
4298  store i32 %val, i32* %out
4299  ret void
4300}
4301
; Test: atomic i32 load through an i32* in the default address space with
; syncscope("singlethread-one-as") and acquire ordering; the loaded value is
; then written to %out with a non-atomic store.
; Every check prefix below expects a flat_load_dword without glc, an
; s_waitcnt, and a flat_store_dword, with no other instruction between the
; load and the store apart from the address moves. The tgsplit prefix waits on
; vmcnt(0) only; all other prefixes wait on vmcnt(0) lgkmcnt(0).
; The check lines are autogenerated (see the NOTE on the first line of the file).
4302define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
4303; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
4304; GFX7:       ; %bb.0: ; %entry
4305; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4306; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4307; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4308; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4309; GFX7-NEXT:    flat_load_dword v0, v[0:1]
4310; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4311; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4312; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4313; GFX7-NEXT:    flat_store_dword v[2:3], v0
4314; GFX7-NEXT:    s_endpgm
4315;
4316; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
4317; GFX10-WGP:       ; %bb.0: ; %entry
4318; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4319; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4320; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4321; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4322; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
4323; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4324; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4325; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4326; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4327; GFX10-WGP-NEXT:    s_endpgm
4328;
4329; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
4330; GFX10-CU:       ; %bb.0: ; %entry
4331; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4332; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4334; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4335; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
4336; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4337; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4338; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4339; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4340; GFX10-CU-NEXT:    s_endpgm
4341;
4342; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load:
4343; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4344; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4345; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4347; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4348; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
4349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4351; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4352; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
4353; SKIP-CACHE-INV-NEXT:    s_endpgm
4354;
4355; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
4356; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4357; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4358; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4359; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4360; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4361; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4362; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4363; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4364; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4365; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4366; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4367;
4368; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
4369; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4370; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4371; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4372; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4373; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4374; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4375; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4376; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4377; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4378; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4379; GFX90A-TGSPLIT-NEXT:    s_endpgm
4380    i32* %in, i32* %out) {
4381entry:
4382  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
4383  store i32 %val, i32* %out
4384  ret void
4385}
4386
; Test: atomic i32 load through an i32* in the default address space with
; syncscope("singlethread-one-as") and seq_cst ordering; the loaded value is
; then written to %out with a non-atomic store.
; Every check prefix below expects a flat_load_dword without glc, an
; s_waitcnt, and a flat_store_dword, with no other instruction between the
; load and the store apart from the address moves. The tgsplit prefix waits on
; vmcnt(0) only; all other prefixes wait on vmcnt(0) lgkmcnt(0).
; The check lines are autogenerated (see the NOTE on the first line of the file).
4387define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
4388; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
4389; GFX7:       ; %bb.0: ; %entry
4390; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4391; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4392; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4393; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4394; GFX7-NEXT:    flat_load_dword v0, v[0:1]
4395; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4396; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4397; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4398; GFX7-NEXT:    flat_store_dword v[2:3], v0
4399; GFX7-NEXT:    s_endpgm
4400;
4401; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
4402; GFX10-WGP:       ; %bb.0: ; %entry
4403; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4404; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4405; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4406; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4407; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
4408; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4409; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4410; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4411; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4412; GFX10-WGP-NEXT:    s_endpgm
4413;
4414; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
4415; GFX10-CU:       ; %bb.0: ; %entry
4416; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4417; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4418; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4419; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4420; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
4421; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4422; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4423; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4424; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4425; GFX10-CU-NEXT:    s_endpgm
4426;
4427; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load:
4428; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4429; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4430; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4432; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4433; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
4434; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4436; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4437; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
4438; SKIP-CACHE-INV-NEXT:    s_endpgm
4439;
4440; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
4441; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4442; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4443; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4445; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4446; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4447; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4448; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4449; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4450; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4451; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4452;
4453; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
4454; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4455; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4456; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4457; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4458; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4459; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4460; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4461; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4462; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4463; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4464; GFX90A-TGSPLIT-NEXT:    s_endpgm
4465    i32* %in, i32* %out) {
4466entry:
4467  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
4468  store i32 %val, i32* %out
4469  ret void
4470}
4471
; Test: atomic i32 store of %in through %out (an i32* in the default address
; space) with syncscope("singlethread-one-as") and unordered ordering.
; Every check prefix below expects exactly one flat_store_dword. The only
; s_waitcnt in the expected sequences is the lgkmcnt(0) wait that follows the
; s_load_dword/s_load_dwordx2 instructions; nothing is expected between the
; operand moves and the store.
; The check lines are autogenerated (see the NOTE on the first line of the file).
4472define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
4473; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
4474; GFX7:       ; %bb.0: ; %entry
4475; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4476; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4477; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4478; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4479; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4480; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4481; GFX7-NEXT:    flat_store_dword v[0:1], v2
4482; GFX7-NEXT:    s_endpgm
4483;
4484; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
4485; GFX10-WGP:       ; %bb.0: ; %entry
4486; GFX10-WGP-NEXT:    s_clause 0x1
4487; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4488; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4489; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4490; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4491; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4492; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4493; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4494; GFX10-WGP-NEXT:    s_endpgm
4495;
4496; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
4497; GFX10-CU:       ; %bb.0: ; %entry
4498; GFX10-CU-NEXT:    s_clause 0x1
4499; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4500; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4501; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4502; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4503; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4504; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4505; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4506; GFX10-CU-NEXT:    s_endpgm
4507;
4508; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store:
4509; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4510; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
4511; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4512; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4516; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4517; SKIP-CACHE-INV-NEXT:    s_endpgm
4518;
4519; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
4520; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4521; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4522; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4523; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4524; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4525; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4526; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4527; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4528;
4529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
4530; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4531; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4532; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4533; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4534; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4535; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4536; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4537; GFX90A-TGSPLIT-NEXT:    s_endpgm
4538    i32 %in, i32* %out) {
4539entry:
4540  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
4541  ret void
4542}
4543
; Test: atomic i32 store of %in through %out (an i32* in the default address
; space) with syncscope("singlethread-one-as") and monotonic ordering.
; Every check prefix below expects exactly one flat_store_dword. The only
; s_waitcnt in the expected sequences is the lgkmcnt(0) wait that follows the
; s_load_dword/s_load_dwordx2 instructions; nothing is expected between the
; operand moves and the store.
; The check lines are autogenerated (see the NOTE on the first line of the file).
4544define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
4545; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
4546; GFX7:       ; %bb.0: ; %entry
4547; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4548; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4549; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4550; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4551; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4552; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4553; GFX7-NEXT:    flat_store_dword v[0:1], v2
4554; GFX7-NEXT:    s_endpgm
4555;
4556; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
4557; GFX10-WGP:       ; %bb.0: ; %entry
4558; GFX10-WGP-NEXT:    s_clause 0x1
4559; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4560; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4561; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4562; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4563; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4564; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4565; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4566; GFX10-WGP-NEXT:    s_endpgm
4567;
4568; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
4569; GFX10-CU:       ; %bb.0: ; %entry
4570; GFX10-CU-NEXT:    s_clause 0x1
4571; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4572; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4573; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4574; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4575; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4576; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4577; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4578; GFX10-CU-NEXT:    s_endpgm
4579;
4580; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store:
4581; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4582; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
4583; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4584; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4585; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4586; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4587; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4588; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4589; SKIP-CACHE-INV-NEXT:    s_endpgm
4590;
4591; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
4592; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4593; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4594; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4595; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4596; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4597; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4598; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4599; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4600;
4601; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
4602; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4603; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4604; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4605; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4606; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4607; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4608; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4609; GFX90A-TGSPLIT-NEXT:    s_endpgm
4610    i32 %in, i32* %out) {
4611entry:
4612  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
4613  ret void
4614}
4615
; Test: atomic i32 store of %in through %out (an i32* in the default address
; space) with syncscope("singlethread-one-as") and release ordering.
; Every check prefix below expects exactly one flat_store_dword. The only
; s_waitcnt in the expected sequences is the lgkmcnt(0) wait that follows the
; s_load_dword/s_load_dwordx2 instructions; no additional wait or cache
; instruction is expected ahead of the store.
; The check lines are autogenerated (see the NOTE on the first line of the file).
4616define amdgpu_kernel void @flat_singlethread_one_as_release_store(
4617; GFX7-LABEL: flat_singlethread_one_as_release_store:
4618; GFX7:       ; %bb.0: ; %entry
4619; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4620; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4621; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4623; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4624; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4625; GFX7-NEXT:    flat_store_dword v[0:1], v2
4626; GFX7-NEXT:    s_endpgm
4627;
4628; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
4629; GFX10-WGP:       ; %bb.0: ; %entry
4630; GFX10-WGP-NEXT:    s_clause 0x1
4631; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4632; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4633; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4634; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4635; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4636; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4637; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4638; GFX10-WGP-NEXT:    s_endpgm
4639;
4640; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
4641; GFX10-CU:       ; %bb.0: ; %entry
4642; GFX10-CU-NEXT:    s_clause 0x1
4643; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4644; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4645; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4646; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4647; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4648; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4649; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4650; GFX10-CU-NEXT:    s_endpgm
4651;
4652; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store:
4653; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4654; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
4655; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4656; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4657; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4658; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4659; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4660; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4661; SKIP-CACHE-INV-NEXT:    s_endpgm
4662;
4663; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
4664; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4665; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4666; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4667; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4668; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4669; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4670; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4671; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4672;
4673; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
4674; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4675; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4676; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4677; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4678; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4679; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4680; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4681; GFX90A-TGSPLIT-NEXT:    s_endpgm
4682    i32 %in, i32* %out) {
4683entry:
4684  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
4685  ret void
4686}
4687
; Atomic store of %in through %out (an i32* in the default address space) with
; seq_cst ordering at syncscope "singlethread-one-as".
; The autogenerated checks below expect, for every llc configuration, only the
; scalar loads of the two arguments, one s_waitcnt lgkmcnt(0), the moves into
; VGPRs and a bare flat_store_dword: no modifier on the store and no further
; wait instruction around it.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
4688define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
4689; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
4690; GFX7:       ; %bb.0: ; %entry
4691; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
4692; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
4693; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4694; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4695; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4696; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4697; GFX7-NEXT:    flat_store_dword v[0:1], v2
4698; GFX7-NEXT:    s_endpgm
4699;
4700; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
4701; GFX10-WGP:       ; %bb.0: ; %entry
4702; GFX10-WGP-NEXT:    s_clause 0x1
4703; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4704; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
4705; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4706; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4707; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4708; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4709; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4710; GFX10-WGP-NEXT:    s_endpgm
4711;
4712; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
4713; GFX10-CU:       ; %bb.0: ; %entry
4714; GFX10-CU-NEXT:    s_clause 0x1
4715; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4716; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
4717; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4718; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4719; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4720; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4721; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4722; GFX10-CU-NEXT:    s_endpgm
4723;
4724; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store:
4725; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4726; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
4727; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4728; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4729; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4730; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4731; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4732; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4733; SKIP-CACHE-INV-NEXT:    s_endpgm
4734;
4735; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
4736; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4737; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4738; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4739; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4740; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4741; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4742; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4743; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4744;
4745; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
4746; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4747; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
4748; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4749; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4751; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4752; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4753; GFX90A-TGSPLIT-NEXT:    s_endpgm
4754    i32 %in, i32* %out) {
4755entry:
4756  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
4757  ret void
4758}
4759
; Atomic exchange (volatile xchg) of %in into %out with monotonic ordering at
; syncscope "singlethread-one-as"; the result %val is never used.
; The autogenerated checks below expect, for every llc configuration, a
; flat_atomic_swap without a destination register or glc modifier, followed
; directly by s_endpgm.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
4760define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
4761; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4762; GFX7:       ; %bb.0: ; %entry
4763; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4764; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4765; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4766; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4767; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4768; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4769; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4770; GFX7-NEXT:    s_endpgm
4771;
4772; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4773; GFX10-WGP:       ; %bb.0: ; %entry
4774; GFX10-WGP-NEXT:    s_clause 0x1
4775; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4776; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4777; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4778; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4779; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4780; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4781; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
4782; GFX10-WGP-NEXT:    s_endpgm
4783;
4784; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4785; GFX10-CU:       ; %bb.0: ; %entry
4786; GFX10-CU-NEXT:    s_clause 0x1
4787; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4788; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4789; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4790; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4791; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4792; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4793; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
4794; GFX10-CU-NEXT:    s_endpgm
4795;
4796; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4797; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4798; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4799; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4800; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4804; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
4805; SKIP-CACHE-INV-NEXT:    s_endpgm
4806;
4807; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4808; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4809; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4810; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4811; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4812; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4813; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4814; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4815; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4816;
4817; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
4818; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4819; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4820; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4821; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4822; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4823; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4824; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4825; GFX90A-TGSPLIT-NEXT:    s_endpgm
4826    i32* %out, i32 %in) {
4827entry:
4828  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
4829  ret void
4830}
4831
; Atomic exchange (volatile xchg) of %in into %out with acquire ordering at
; syncscope "singlethread-one-as"; the result %val is never used.
; The autogenerated checks below expect, for every llc configuration, a
; flat_atomic_swap without a destination register or glc modifier, followed
; directly by s_endpgm (no wait or other instruction after the atomic).
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
4832define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
4833; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4834; GFX7:       ; %bb.0: ; %entry
4835; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4836; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4837; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4839; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4840; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4841; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4842; GFX7-NEXT:    s_endpgm
4843;
4844; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4845; GFX10-WGP:       ; %bb.0: ; %entry
4846; GFX10-WGP-NEXT:    s_clause 0x1
4847; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4848; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4849; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4850; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4851; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4852; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4853; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
4854; GFX10-WGP-NEXT:    s_endpgm
4855;
4856; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4857; GFX10-CU:       ; %bb.0: ; %entry
4858; GFX10-CU-NEXT:    s_clause 0x1
4859; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4860; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4861; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4862; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4863; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4864; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4865; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
4866; GFX10-CU-NEXT:    s_endpgm
4867;
4868; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4869; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4870; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4871; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4872; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4873; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4874; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4876; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
4877; SKIP-CACHE-INV-NEXT:    s_endpgm
4878;
4879; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4880; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4881; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4882; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4883; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4884; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4885; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4886; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4887; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4888;
4889; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
4890; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4891; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4892; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4893; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4894; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4895; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4896; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4897; GFX90A-TGSPLIT-NEXT:    s_endpgm
4898    i32* %out, i32 %in) {
4899entry:
4900  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
4901  ret void
4902}
4903
; Atomic exchange (volatile xchg) of %in into %out with release ordering at
; syncscope "singlethread-one-as"; the result %val is never used.
; The autogenerated checks below expect, for every llc configuration, a
; flat_atomic_swap without a destination register or glc modifier. The only
; wait before it is the s_waitcnt lgkmcnt(0) that follows the scalar loads.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
4904define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
4905; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
4906; GFX7:       ; %bb.0: ; %entry
4907; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4908; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4909; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4910; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4911; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4912; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4913; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4914; GFX7-NEXT:    s_endpgm
4915;
4916; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
4917; GFX10-WGP:       ; %bb.0: ; %entry
4918; GFX10-WGP-NEXT:    s_clause 0x1
4919; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4920; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4921; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4922; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4923; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4924; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4925; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
4926; GFX10-WGP-NEXT:    s_endpgm
4927;
4928; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
4929; GFX10-CU:       ; %bb.0: ; %entry
4930; GFX10-CU-NEXT:    s_clause 0x1
4931; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4932; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4933; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4934; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4935; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4936; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4937; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
4938; GFX10-CU-NEXT:    s_endpgm
4939;
4940; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw:
4941; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4942; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4943; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4944; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4945; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4946; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4947; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4948; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
4949; SKIP-CACHE-INV-NEXT:    s_endpgm
4950;
4951; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
4952; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4953; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4954; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4955; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4956; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4957; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4958; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4959; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4960;
4961; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
4962; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4963; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4964; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4965; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4966; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4967; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4968; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4969; GFX90A-TGSPLIT-NEXT:    s_endpgm
4970    i32* %out, i32 %in) {
4971entry:
4972  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
4973  ret void
4974}
4975
; Atomic exchange (volatile xchg) of %in into %out with acq_rel ordering at
; syncscope "singlethread-one-as"; the result %val is never used.
; The autogenerated checks below expect, for every llc configuration, a
; flat_atomic_swap without a destination register or glc modifier, with no
; extra wait before it (beyond the one after the scalar loads) and nothing
; between it and s_endpgm.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
4976define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
4977; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
4978; GFX7:       ; %bb.0: ; %entry
4979; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4980; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4981; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4982; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4983; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4984; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4985; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
4986; GFX7-NEXT:    s_endpgm
4987;
4988; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
4989; GFX10-WGP:       ; %bb.0: ; %entry
4990; GFX10-WGP-NEXT:    s_clause 0x1
4991; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4992; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4993; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4994; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4995; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4996; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4997; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
4998; GFX10-WGP-NEXT:    s_endpgm
4999;
5000; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
5001; GFX10-CU:       ; %bb.0: ; %entry
5002; GFX10-CU-NEXT:    s_clause 0x1
5003; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5004; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5005; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5006; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5007; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5008; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5009; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5010; GFX10-CU-NEXT:    s_endpgm
5011;
5012; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
5013; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5014; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5015; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5016; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5017; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5018; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5019; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5020; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5021; SKIP-CACHE-INV-NEXT:    s_endpgm
5022;
5023; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
5024; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5025; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5026; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5028; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5029; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5030; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5031; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5032;
5033; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
5034; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5035; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5036; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5037; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5038; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5039; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5040; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5041; GFX90A-TGSPLIT-NEXT:    s_endpgm
5042    i32* %out, i32 %in) {
5043entry:
5044  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
5045  ret void
5046}
5047
; Atomic exchange (volatile xchg) of %in into %out with seq_cst ordering at
; syncscope "singlethread-one-as"; the result %val is never used.
; The autogenerated checks below expect, for every llc configuration, a
; flat_atomic_swap without a destination register or glc modifier, with no
; extra wait before it (beyond the one after the scalar loads) and nothing
; between it and s_endpgm.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
5048define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
5049; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5050; GFX7:       ; %bb.0: ; %entry
5051; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5052; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5053; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5055; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5056; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5057; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5058; GFX7-NEXT:    s_endpgm
5059;
5060; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5061; GFX10-WGP:       ; %bb.0: ; %entry
5062; GFX10-WGP-NEXT:    s_clause 0x1
5063; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5064; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5065; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5066; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5067; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5068; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5069; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5070; GFX10-WGP-NEXT:    s_endpgm
5071;
5072; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5073; GFX10-CU:       ; %bb.0: ; %entry
5074; GFX10-CU-NEXT:    s_clause 0x1
5075; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5076; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5077; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5079; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5080; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5081; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5082; GFX10-CU-NEXT:    s_endpgm
5083;
5084; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5085; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5086; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5087; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5088; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5092; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5093; SKIP-CACHE-INV-NEXT:    s_endpgm
5094;
5095; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5096; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5097; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5098; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5099; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5100; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5101; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5102; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5103; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5104;
5105; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
5106; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5107; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5108; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5109; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5110; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5111; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5112; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5113; GFX90A-TGSPLIT-NEXT:    s_endpgm
5114    i32* %out, i32 %in) {
5115entry:
5116  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
5117  ret void
5118}
5119
; Atomic exchange (volatile xchg) of %in into %out with acquire ordering at
; syncscope "singlethread-one-as", whose result is then written back to %out
; with an ordinary (non-atomic) store.
; The autogenerated checks below expect the returning form of the instruction
; (flat_atomic_swap with a destination VGPR and the glc modifier) and a single
; s_waitcnt before the dependent flat_store_dword. That wait is
; "vmcnt(0) lgkmcnt(0)" in every llc configuration except gfx90a with
; +tgsplit, where it is "vmcnt(0)" only.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
5120define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
5121; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5122; GFX7:       ; %bb.0: ; %entry
5123; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5124; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5125; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5126; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5127; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5128; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5129; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5130; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5131; GFX7-NEXT:    flat_store_dword v[0:1], v2
5132; GFX7-NEXT:    s_endpgm
5133;
5134; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5135; GFX10-WGP:       ; %bb.0: ; %entry
5136; GFX10-WGP-NEXT:    s_clause 0x1
5137; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5138; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5139; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5140; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5141; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5142; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5143; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5144; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5145; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5146; GFX10-WGP-NEXT:    s_endpgm
5147;
5148; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5149; GFX10-CU:       ; %bb.0: ; %entry
5150; GFX10-CU-NEXT:    s_clause 0x1
5151; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5152; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5153; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5154; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5155; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5156; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5157; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5158; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5159; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5160; GFX10-CU-NEXT:    s_endpgm
5161;
5162; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5163; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5164; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5165; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5166; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5167; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5168; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5169; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5170; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5171; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5172; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5173; SKIP-CACHE-INV-NEXT:    s_endpgm
5174;
5175; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5176; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5177; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5178; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5179; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5180; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5181; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5182; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5183; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5184; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5185; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5186;
5187; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
5188; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5189; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5190; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5191; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5192; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5193; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5194; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5195; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5196; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5197; GFX90A-TGSPLIT-NEXT:    s_endpgm
5198    i32* %out, i32 %in) {
5199entry:
5200  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
  ; Using %val here is the only IR difference from the variant that discards
  ; the result; the checks above expect the returning (glc) swap.
5201  store i32 %val, i32* %out, align 4
5202  ret void
5203}
5204
; Atomic exchange (volatile xchg) of %in into %out with acq_rel ordering at
; syncscope "singlethread-one-as", whose result is then written back to %out
; with an ordinary (non-atomic) store.
; The autogenerated checks below expect the returning form of the instruction
; (flat_atomic_swap with a destination VGPR and the glc modifier) and a single
; s_waitcnt before the dependent flat_store_dword. That wait is
; "vmcnt(0) lgkmcnt(0)" in every llc configuration except gfx90a with
; +tgsplit, where it is "vmcnt(0)" only.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
5205define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
5206; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5207; GFX7:       ; %bb.0: ; %entry
5208; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5209; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5210; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5211; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5212; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5213; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5214; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5215; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5216; GFX7-NEXT:    flat_store_dword v[0:1], v2
5217; GFX7-NEXT:    s_endpgm
5218;
5219; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5220; GFX10-WGP:       ; %bb.0: ; %entry
5221; GFX10-WGP-NEXT:    s_clause 0x1
5222; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5223; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5224; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5225; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5226; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5227; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5228; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5229; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5230; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5231; GFX10-WGP-NEXT:    s_endpgm
5232;
5233; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5234; GFX10-CU:       ; %bb.0: ; %entry
5235; GFX10-CU-NEXT:    s_clause 0x1
5236; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5237; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5238; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5239; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5240; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5241; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5242; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5243; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5244; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5245; GFX10-CU-NEXT:    s_endpgm
5246;
5247; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5248; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5249; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5250; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5251; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5252; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5253; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5255; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5256; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5257; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5258; SKIP-CACHE-INV-NEXT:    s_endpgm
5259;
5260; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5261; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5262; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5263; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5264; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5265; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5266; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5267; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5268; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5269; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5270; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5271;
5272; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
5273; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5275; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5276; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5277; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5278; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5279; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5280; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5281; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5282; GFX90A-TGSPLIT-NEXT:    s_endpgm
5283    i32* %out, i32 %in) {
5284entry:
5285  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
  ; Using %val here is the only IR difference from the variant that discards
  ; the result; the checks above expect the returning (glc) swap.
5286  store i32 %val, i32* %out, align 4
5287  ret void
5288}
5289
; Atomic exchange (volatile xchg) of %in into %out with seq_cst ordering at
; syncscope "singlethread-one-as", whose result is then written back to %out
; with an ordinary (non-atomic) store.
; The autogenerated checks below expect the returning form of the instruction
; (flat_atomic_swap with a destination VGPR and the glc modifier) and a single
; s_waitcnt before the dependent flat_store_dword. That wait is
; "vmcnt(0) lgkmcnt(0)" in every llc configuration except gfx90a with
; +tgsplit, where it is "vmcnt(0)" only.
; Regenerate the checks with utils/update_llc_test_checks.py instead of
; editing them by hand.
5290define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
5291; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5292; GFX7:       ; %bb.0: ; %entry
5293; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5294; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5295; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5296; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5297; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5298; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5299; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5300; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5301; GFX7-NEXT:    flat_store_dword v[0:1], v2
5302; GFX7-NEXT:    s_endpgm
5303;
5304; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5305; GFX10-WGP:       ; %bb.0: ; %entry
5306; GFX10-WGP-NEXT:    s_clause 0x1
5307; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5308; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5309; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5310; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5311; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5312; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5313; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5314; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5315; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5316; GFX10-WGP-NEXT:    s_endpgm
5317;
5318; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5319; GFX10-CU:       ; %bb.0: ; %entry
5320; GFX10-CU-NEXT:    s_clause 0x1
5321; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5322; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5323; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5324; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5325; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5326; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5327; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5328; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5329; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5330; GFX10-CU-NEXT:    s_endpgm
5331;
5332; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5333; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5334; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5335; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5336; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5338; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5339; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5340; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5341; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5342; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5343; SKIP-CACHE-INV-NEXT:    s_endpgm
5344;
5345; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5346; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5347; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5348; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5349; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5350; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5351; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5352; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5353; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5354; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5355; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5356;
5357; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
5358; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5359; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5360; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5361; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5362; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5363; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5364; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5365; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5366; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5367; GFX90A-TGSPLIT-NEXT:    s_endpgm
5368    i32* %out, i32 %in) {
5369entry:
5370  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
  ; Using %val here is the only IR difference from the variant that discards
  ; the result; the checks above expect the returning (glc) swap.
5371  store i32 %val, i32* %out, align 4
5372  ret void
5373}
5374
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with monotonic success / monotonic failure ordering.
; The value returned by the cmpxchg is never used.
; For each of the six checked configurations the expected code is: kernarg
; loads, one s_waitcnt for those loads, address/operand setup, then a single
; flat_atomic_cmpswap (no glc) followed directly by s_endpgm, i.e. nothing is
; emitted after the atomic.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5375define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
5376; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5377; GFX7:       ; %bb.0: ; %entry
5378; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5379; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5380; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5381; GFX7-NEXT:    s_add_u32 s0, s0, 16
5382; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5383; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5384; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5385; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5386; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5387; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5388; GFX7-NEXT:    s_endpgm
5389;
5390; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5391; GFX10-WGP:       ; %bb.0: ; %entry
5392; GFX10-WGP-NEXT:    s_clause 0x1
5393; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5394; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5395; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5396; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5397; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5398; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5399; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5400; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5401; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5402; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5403; GFX10-WGP-NEXT:    s_endpgm
5404;
5405; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5406; GFX10-CU:       ; %bb.0: ; %entry
5407; GFX10-CU-NEXT:    s_clause 0x1
5408; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5409; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5410; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5411; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5412; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5413; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5414; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5415; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5416; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5417; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5418; GFX10-CU-NEXT:    s_endpgm
5419;
5420; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5421; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5422; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5423; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5424; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5425; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5426; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5427; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5428; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5429; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5431; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5432; SKIP-CACHE-INV-NEXT:    s_endpgm
5433;
5434; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5435; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5436; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5437; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5438; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5439; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5440; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5441; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5442; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5443;
5444; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
5445; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5446; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5447; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5448; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5449; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5450; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5451; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5452; GFX90A-TGSPLIT-NEXT:    s_endpgm
5453    i32* %out, i32 %in, i32 %old) {
5454entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5455  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5456  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
5457  ret void
5458}
5459
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with acquire success / monotonic failure ordering.
; The value returned by the cmpxchg is never used.
; The expected code for all six checked configurations ends with a single
; flat_atomic_cmpswap (no glc) followed directly by s_endpgm: the checks pin
; that the acquire ordering adds no instruction after the atomic here.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5460define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
5461; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5462; GFX7:       ; %bb.0: ; %entry
5463; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5464; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5465; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5466; GFX7-NEXT:    s_add_u32 s0, s0, 16
5467; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5468; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5469; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5470; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5471; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5472; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5473; GFX7-NEXT:    s_endpgm
5474;
5475; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5476; GFX10-WGP:       ; %bb.0: ; %entry
5477; GFX10-WGP-NEXT:    s_clause 0x1
5478; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5479; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5480; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5481; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5482; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5483; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5484; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5485; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5486; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5487; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5488; GFX10-WGP-NEXT:    s_endpgm
5489;
5490; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5491; GFX10-CU:       ; %bb.0: ; %entry
5492; GFX10-CU-NEXT:    s_clause 0x1
5493; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5494; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5495; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5496; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5497; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5498; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5499; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5500; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5501; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5502; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5503; GFX10-CU-NEXT:    s_endpgm
5504;
5505; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5506; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5507; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5508; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5509; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5510; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5511; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5512; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5516; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5517; SKIP-CACHE-INV-NEXT:    s_endpgm
5518;
5519; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5520; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5521; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5522; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5523; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5524; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5525; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5526; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5527; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5528;
5529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
5530; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5531; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5532; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5533; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5534; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5535; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5536; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5537; GFX90A-TGSPLIT-NEXT:    s_endpgm
5538    i32* %out, i32 %in, i32 %old) {
5539entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5540  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5541  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
5542  ret void
5543}
5544
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with release success / monotonic failure ordering.
; The value returned by the cmpxchg is never used.
; In all six checked configurations the only s_waitcnt is the one covering the
; kernarg loads; the atomic itself is a bare flat_atomic_cmpswap (no glc)
; followed directly by s_endpgm, so the release ordering adds no instruction.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5545define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
5546; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5547; GFX7:       ; %bb.0: ; %entry
5548; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5549; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5550; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5551; GFX7-NEXT:    s_add_u32 s0, s0, 16
5552; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5553; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5554; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5555; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5556; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5557; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5558; GFX7-NEXT:    s_endpgm
5559;
5560; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5561; GFX10-WGP:       ; %bb.0: ; %entry
5562; GFX10-WGP-NEXT:    s_clause 0x1
5563; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5564; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5566; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5567; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5568; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5569; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5570; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5571; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5572; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5573; GFX10-WGP-NEXT:    s_endpgm
5574;
5575; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5576; GFX10-CU:       ; %bb.0: ; %entry
5577; GFX10-CU-NEXT:    s_clause 0x1
5578; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5579; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5580; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5581; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5582; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5583; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5584; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5585; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5586; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5587; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5588; GFX10-CU-NEXT:    s_endpgm
5589;
5590; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5591; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5592; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5593; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5594; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5595; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5596; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5598; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5599; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5600; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5601; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5602; SKIP-CACHE-INV-NEXT:    s_endpgm
5603;
5604; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5605; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5606; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5607; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5608; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5609; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5610; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5611; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5612; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5613;
5614; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
5615; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5616; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5617; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5618; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5619; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5620; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5621; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5622; GFX90A-TGSPLIT-NEXT:    s_endpgm
5623    i32* %out, i32 %in, i32 %old) {
5624entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5625  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5626  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
5627  ret void
5628}
5629
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with acq_rel success / monotonic failure ordering.
; The value returned by the cmpxchg is never used.
; In all six checked configurations the only s_waitcnt is the one covering the
; kernarg loads; the atomic itself is a bare flat_atomic_cmpswap (no glc)
; followed directly by s_endpgm, so acq_rel adds no instruction on either side.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5630define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
5631; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5632; GFX7:       ; %bb.0: ; %entry
5633; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5634; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5635; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5636; GFX7-NEXT:    s_add_u32 s0, s0, 16
5637; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5638; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5639; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5640; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5641; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5642; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5643; GFX7-NEXT:    s_endpgm
5644;
5645; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5646; GFX10-WGP:       ; %bb.0: ; %entry
5647; GFX10-WGP-NEXT:    s_clause 0x1
5648; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5649; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5650; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5651; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5652; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5653; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5654; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5655; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5656; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5657; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5658; GFX10-WGP-NEXT:    s_endpgm
5659;
5660; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5661; GFX10-CU:       ; %bb.0: ; %entry
5662; GFX10-CU-NEXT:    s_clause 0x1
5663; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5664; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5665; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5666; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5667; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5668; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5669; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5670; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5671; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5672; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5673; GFX10-CU-NEXT:    s_endpgm
5674;
5675; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5676; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5677; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5678; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5679; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5680; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5681; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5686; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5687; SKIP-CACHE-INV-NEXT:    s_endpgm
5688;
5689; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5690; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5691; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5692; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5693; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5694; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5695; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5696; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5697; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5698;
5699; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
5700; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5701; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5702; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5703; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5704; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5705; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5706; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5707; GFX90A-TGSPLIT-NEXT:    s_endpgm
5708    i32* %out, i32 %in, i32 %old) {
5709entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5710  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5711  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
5712  ret void
5713}
5714
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with seq_cst success / monotonic failure ordering.
; The value returned by the cmpxchg is never used.
; In all six checked configurations the only s_waitcnt is the one covering the
; kernarg loads; the atomic itself is a bare flat_atomic_cmpswap (no glc)
; followed directly by s_endpgm, so seq_cst adds no instruction on either side.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5715define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
5716; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5717; GFX7:       ; %bb.0: ; %entry
5718; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5719; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5720; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5721; GFX7-NEXT:    s_add_u32 s0, s0, 16
5722; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5723; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5724; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5725; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5726; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5727; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5728; GFX7-NEXT:    s_endpgm
5729;
5730; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5731; GFX10-WGP:       ; %bb.0: ; %entry
5732; GFX10-WGP-NEXT:    s_clause 0x1
5733; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5734; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5735; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5736; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5737; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5738; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5739; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5740; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5741; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5742; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5743; GFX10-WGP-NEXT:    s_endpgm
5744;
5745; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5746; GFX10-CU:       ; %bb.0: ; %entry
5747; GFX10-CU-NEXT:    s_clause 0x1
5748; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5749; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5750; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5751; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5752; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5753; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5754; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5755; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5756; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5757; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5758; GFX10-CU-NEXT:    s_endpgm
5759;
5760; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5761; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5762; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5763; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5764; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5765; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5766; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5767; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5768; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5769; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5770; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5771; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5772; SKIP-CACHE-INV-NEXT:    s_endpgm
5773;
5774; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5775; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5776; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5777; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5778; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5779; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5780; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5781; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5782; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5783;
5784; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
5785; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5786; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5787; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5788; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5789; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5790; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5791; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5792; GFX90A-TGSPLIT-NEXT:    s_endpgm
5793    i32* %out, i32 %in, i32 %old) {
5794entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5795  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5796  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
5797  ret void
5798}
5799
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with monotonic success / acquire failure ordering
; (failure ordering stronger than success ordering).
; The value returned by the cmpxchg is never used.
; The expected code for all six checked configurations ends with a single
; flat_atomic_cmpswap (no glc) followed directly by s_endpgm: the checks pin
; that the acquire failure ordering adds no instruction after the atomic here.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5800define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
5801; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5802; GFX7:       ; %bb.0: ; %entry
5803; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5804; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5805; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5806; GFX7-NEXT:    s_add_u32 s0, s0, 16
5807; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5808; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5809; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5810; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5811; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5812; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5813; GFX7-NEXT:    s_endpgm
5814;
5815; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5816; GFX10-WGP:       ; %bb.0: ; %entry
5817; GFX10-WGP-NEXT:    s_clause 0x1
5818; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5819; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5820; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5821; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5822; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5823; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5824; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5825; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5826; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5827; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5828; GFX10-WGP-NEXT:    s_endpgm
5829;
5830; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5831; GFX10-CU:       ; %bb.0: ; %entry
5832; GFX10-CU-NEXT:    s_clause 0x1
5833; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5834; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5835; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5836; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5837; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5838; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5839; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5840; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5841; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5842; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5843; GFX10-CU-NEXT:    s_endpgm
5844;
5845; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5846; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5847; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5848; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5849; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5850; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5851; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5852; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5853; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5854; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5855; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5856; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5857; SKIP-CACHE-INV-NEXT:    s_endpgm
5858;
5859; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5860; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5861; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5862; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5863; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5864; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5865; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5866; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5867; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5868;
5869; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
5870; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5871; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5872; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5873; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5874; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5875; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5876; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5877; GFX90A-TGSPLIT-NEXT:    s_endpgm
5878    i32* %out, i32 %in, i32 %old) {
5879entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5880  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5881  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
5882  ret void
5883}
5884
; Kernel under test: a volatile cmpxchg on a flat i32 pointer at syncscope
; "singlethread-one-as" with acquire success / acquire failure ordering.
; The value returned by the cmpxchg is never used.
; The expected code for all six checked configurations ends with a single
; flat_atomic_cmpswap (no glc) followed directly by s_endpgm: the checks pin
; that neither acquire ordering adds an instruction after the atomic here.
; The gfx90a runs encode the 16-byte displacement as offset:16 on the atomic;
; the other runs add it up front with s_add_u32/s_addc_u32.
5885define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
5886; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5887; GFX7:       ; %bb.0: ; %entry
5888; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5889; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5890; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5891; GFX7-NEXT:    s_add_u32 s0, s0, 16
5892; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5893; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5894; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5895; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5896; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5897; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5898; GFX7-NEXT:    s_endpgm
5899;
5900; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5901; GFX10-WGP:       ; %bb.0: ; %entry
5902; GFX10-WGP-NEXT:    s_clause 0x1
5903; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5904; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5905; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5906; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5907; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5908; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5909; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5910; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5911; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5912; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5913; GFX10-WGP-NEXT:    s_endpgm
5914;
5915; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5916; GFX10-CU:       ; %bb.0: ; %entry
5917; GFX10-CU-NEXT:    s_clause 0x1
5918; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5919; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5920; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5921; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5922; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5923; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5924; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5925; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5926; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5927; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5928; GFX10-CU-NEXT:    s_endpgm
5929;
5930; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5931; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5932; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5933; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5934; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5935; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5936; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5937; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5938; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5939; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5941; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5942; SKIP-CACHE-INV-NEXT:    s_endpgm
5943;
5944; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5945; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5946; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5947; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5948; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5949; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5950; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5951; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5952; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5953;
5954; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
5955; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5956; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5957; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5958; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5959; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5960; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5961; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5962; GFX90A-TGSPLIT-NEXT:    s_endpgm
5963    i32* %out, i32 %in, i32 %old) {
5964entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
5965  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
5966  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
5967  ret void
5968}
5969
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering release and failure
; ordering acquire. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
5970define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
5971; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
5972; GFX7:       ; %bb.0: ; %entry
5973; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5974; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5975; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5976; GFX7-NEXT:    s_add_u32 s0, s0, 16
5977; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5978; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5979; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5980; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5981; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5982; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5983; GFX7-NEXT:    s_endpgm
5984;
5985; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
5986; GFX10-WGP:       ; %bb.0: ; %entry
5987; GFX10-WGP-NEXT:    s_clause 0x1
5988; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5989; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5990; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5991; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5992; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5993; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5994; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5995; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5996; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5997; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5998; GFX10-WGP-NEXT:    s_endpgm
5999;
6000; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
6001; GFX10-CU:       ; %bb.0: ; %entry
6002; GFX10-CU-NEXT:    s_clause 0x1
6003; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6004; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6005; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6006; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6007; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6008; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6009; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6010; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6011; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6012; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6013; GFX10-CU-NEXT:    s_endpgm
6014;
6015; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
6016; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6017; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6018; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6019; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6020; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6021; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6022; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6023; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6024; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6025; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6026; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6027; SKIP-CACHE-INV-NEXT:    s_endpgm
6028;
6029; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
6030; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6031; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6032; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6033; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6034; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6035; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6036; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6037; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6038;
6039; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
6040; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6041; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6042; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6043; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6044; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6045; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6046; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6047; GFX90A-TGSPLIT-NEXT:    s_endpgm
6048    i32* %out, i32 %in, i32 %old) {
6049entry:
6050  %gep = getelementptr i32, i32* %out, i32 4
6051  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
6052  ret void
6053}
6054
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering acq_rel and failure
; ordering acquire. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6055define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
6056; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6057; GFX7:       ; %bb.0: ; %entry
6058; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6059; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6060; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6061; GFX7-NEXT:    s_add_u32 s0, s0, 16
6062; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6063; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6064; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6065; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6066; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6067; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6068; GFX7-NEXT:    s_endpgm
6069;
6070; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6071; GFX10-WGP:       ; %bb.0: ; %entry
6072; GFX10-WGP-NEXT:    s_clause 0x1
6073; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6074; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6075; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6076; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6077; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6078; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6079; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6080; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6081; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6082; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6083; GFX10-WGP-NEXT:    s_endpgm
6084;
6085; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6086; GFX10-CU:       ; %bb.0: ; %entry
6087; GFX10-CU-NEXT:    s_clause 0x1
6088; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6089; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6090; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6091; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6092; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6093; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6094; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6095; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6096; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6097; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6098; GFX10-CU-NEXT:    s_endpgm
6099;
6100; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6101; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6102; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6103; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6104; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6105; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6106; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6107; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6108; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6109; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6111; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6112; SKIP-CACHE-INV-NEXT:    s_endpgm
6113;
6114; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6115; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6116; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6117; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6118; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6119; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6120; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6121; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6122; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6123;
6124; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
6125; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6126; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6127; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6128; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6129; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6130; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6131; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6132; GFX90A-TGSPLIT-NEXT:    s_endpgm
6133    i32* %out, i32 %in, i32 %old) {
6134entry:
6135  %gep = getelementptr i32, i32* %out, i32 4
6136  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
6137  ret void
6138}
6139
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering seq_cst and failure
; ordering acquire. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6140define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
6141; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6142; GFX7:       ; %bb.0: ; %entry
6143; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6144; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6146; GFX7-NEXT:    s_add_u32 s0, s0, 16
6147; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6148; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6149; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6150; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6151; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6152; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6153; GFX7-NEXT:    s_endpgm
6154;
6155; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6156; GFX10-WGP:       ; %bb.0: ; %entry
6157; GFX10-WGP-NEXT:    s_clause 0x1
6158; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6159; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6160; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6161; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6162; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6163; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6164; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6165; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6166; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6167; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6168; GFX10-WGP-NEXT:    s_endpgm
6169;
6170; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6171; GFX10-CU:       ; %bb.0: ; %entry
6172; GFX10-CU-NEXT:    s_clause 0x1
6173; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6174; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6175; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6176; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6177; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6178; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6179; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6180; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6181; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6182; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6183; GFX10-CU-NEXT:    s_endpgm
6184;
6185; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6186; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6187; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6188; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6189; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6190; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6191; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6192; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6193; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6194; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6195; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6196; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6197; SKIP-CACHE-INV-NEXT:    s_endpgm
6198;
6199; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6200; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6201; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6202; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6203; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6204; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6205; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6206; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6207; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6208;
6209; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
6210; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6211; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6212; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6213; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6214; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6215; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6216; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6217; GFX90A-TGSPLIT-NEXT:    s_endpgm
6218    i32* %out, i32 %in, i32 %old) {
6219entry:
6220  %gep = getelementptr i32, i32* %out, i32 4
6221  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
6222  ret void
6223}
6224
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering monotonic and failure
; ordering seq_cst. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6225define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
6226; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6227; GFX7:       ; %bb.0: ; %entry
6228; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6229; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6230; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6231; GFX7-NEXT:    s_add_u32 s0, s0, 16
6232; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6233; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6234; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6235; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6236; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6237; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6238; GFX7-NEXT:    s_endpgm
6239;
6240; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6241; GFX10-WGP:       ; %bb.0: ; %entry
6242; GFX10-WGP-NEXT:    s_clause 0x1
6243; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6244; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6245; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6246; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6247; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6248; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6249; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6250; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6251; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6252; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6253; GFX10-WGP-NEXT:    s_endpgm
6254;
6255; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6256; GFX10-CU:       ; %bb.0: ; %entry
6257; GFX10-CU-NEXT:    s_clause 0x1
6258; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6259; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6260; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6261; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6262; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6263; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6264; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6265; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6266; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6267; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6268; GFX10-CU-NEXT:    s_endpgm
6269;
6270; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6271; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6272; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6273; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6274; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6275; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6276; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6277; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6278; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6279; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6280; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6281; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6282; SKIP-CACHE-INV-NEXT:    s_endpgm
6283;
6284; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6285; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6286; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6288; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6289; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6290; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6291; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6292; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6293;
6294; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
6295; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6296; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6297; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6298; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6299; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6300; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6301; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6302; GFX90A-TGSPLIT-NEXT:    s_endpgm
6303    i32* %out, i32 %in, i32 %old) {
6304entry:
6305  %gep = getelementptr i32, i32* %out, i32 4
6306  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
6307  ret void
6308}
6309
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering acquire and failure
; ordering seq_cst. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6310define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
6311; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6312; GFX7:       ; %bb.0: ; %entry
6313; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6314; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6315; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6316; GFX7-NEXT:    s_add_u32 s0, s0, 16
6317; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6318; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6319; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6320; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6321; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6322; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6323; GFX7-NEXT:    s_endpgm
6324;
6325; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6326; GFX10-WGP:       ; %bb.0: ; %entry
6327; GFX10-WGP-NEXT:    s_clause 0x1
6328; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6329; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6330; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6331; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6332; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6333; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6334; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6335; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6336; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6337; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6338; GFX10-WGP-NEXT:    s_endpgm
6339;
6340; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6341; GFX10-CU:       ; %bb.0: ; %entry
6342; GFX10-CU-NEXT:    s_clause 0x1
6343; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6344; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6345; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6346; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6347; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6348; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6349; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6350; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6351; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6352; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6353; GFX10-CU-NEXT:    s_endpgm
6354;
6355; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6356; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6357; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6358; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6359; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6360; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6361; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6362; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6363; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6364; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6365; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6366; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6367; SKIP-CACHE-INV-NEXT:    s_endpgm
6368;
6369; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6370; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6371; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6372; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6373; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6374; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6375; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6376; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6377; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6378;
6379; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
6380; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6381; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6382; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6383; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6384; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6385; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6386; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6387; GFX90A-TGSPLIT-NEXT:    s_endpgm
6388    i32* %out, i32 %in, i32 %old) {
6389entry:
6390  %gep = getelementptr i32, i32* %out, i32 4
6391  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
6392  ret void
6393}
6394
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering release and failure
; ordering seq_cst. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6395define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
6396; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6397; GFX7:       ; %bb.0: ; %entry
6398; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6399; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6400; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6401; GFX7-NEXT:    s_add_u32 s0, s0, 16
6402; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6403; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6404; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6405; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6406; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6407; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6408; GFX7-NEXT:    s_endpgm
6409;
6410; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6411; GFX10-WGP:       ; %bb.0: ; %entry
6412; GFX10-WGP-NEXT:    s_clause 0x1
6413; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6414; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6415; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6416; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6417; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6418; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6419; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6420; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6421; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6422; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6423; GFX10-WGP-NEXT:    s_endpgm
6424;
6425; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6426; GFX10-CU:       ; %bb.0: ; %entry
6427; GFX10-CU-NEXT:    s_clause 0x1
6428; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6429; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6430; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6431; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6432; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6433; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6434; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6435; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6436; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6437; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6438; GFX10-CU-NEXT:    s_endpgm
6439;
6440; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6441; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6442; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6443; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6444; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6445; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6446; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6450; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6451; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6452; SKIP-CACHE-INV-NEXT:    s_endpgm
6453;
6454; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6455; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6456; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6457; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6458; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6459; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6460; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6461; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6462; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6463;
6464; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
6465; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6466; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6467; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6468; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6469; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6470; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6471; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6472; GFX90A-TGSPLIT-NEXT:    s_endpgm
6473    i32* %out, i32 %in, i32 %old) {
6474entry:
6475  %gep = getelementptr i32, i32* %out, i32 4
6476  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
6477  ret void
6478}
6479
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering acq_rel and failure
; ordering seq_cst. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6480define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
6481; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6482; GFX7:       ; %bb.0: ; %entry
6483; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6484; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6485; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6486; GFX7-NEXT:    s_add_u32 s0, s0, 16
6487; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6488; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6489; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6490; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6491; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6492; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6493; GFX7-NEXT:    s_endpgm
6494;
6495; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6496; GFX10-WGP:       ; %bb.0: ; %entry
6497; GFX10-WGP-NEXT:    s_clause 0x1
6498; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6499; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6500; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6501; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6502; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6503; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6504; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6505; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6506; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6507; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6508; GFX10-WGP-NEXT:    s_endpgm
6509;
6510; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6511; GFX10-CU:       ; %bb.0: ; %entry
6512; GFX10-CU-NEXT:    s_clause 0x1
6513; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6514; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6515; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6516; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6517; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6518; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6519; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6520; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6521; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6522; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6523; GFX10-CU-NEXT:    s_endpgm
6524;
6525; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6526; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6527; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6528; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6529; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6530; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6531; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6532; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6536; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6537; SKIP-CACHE-INV-NEXT:    s_endpgm
6538;
6539; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6540; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6541; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6542; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6543; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6544; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6545; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6546; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6547; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6548;
6549; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
6550; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6551; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6552; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6553; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6554; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6555; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6556; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6557; GFX90A-TGSPLIT-NEXT:    s_endpgm
6558    i32* %out, i32 %in, i32 %old) {
6559entry:
6560  %gep = getelementptr i32, i32* %out, i32 4
6561  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
6562  ret void
6563}
6564
; Test: cmpxchg volatile on %out + 4 x i32 (16 bytes) with
; syncscope("singlethread-one-as"), success ordering seq_cst and failure
; ordering seq_cst. The result (%val) is not used.
; For every check prefix the expected code is: scalar loads of the arguments,
; one s_waitcnt lgkmcnt(0), address/data setup, and a single non-returning
; flat_atomic_cmpswap followed directly by s_endpgm. GFX7/GFX10/SKIP-CACHE-INV
; add the 16-byte offset with s_add_u32/s_addc_u32; GFX90A uses offset:16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
6565define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
6566; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6567; GFX7:       ; %bb.0: ; %entry
6568; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6569; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6570; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6571; GFX7-NEXT:    s_add_u32 s0, s0, 16
6572; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6573; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6574; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6575; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6576; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6577; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6578; GFX7-NEXT:    s_endpgm
6579;
6580; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6581; GFX10-WGP:       ; %bb.0: ; %entry
6582; GFX10-WGP-NEXT:    s_clause 0x1
6583; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6584; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6585; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6586; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6587; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6588; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6589; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6590; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6591; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6592; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6593; GFX10-WGP-NEXT:    s_endpgm
6594;
6595; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6596; GFX10-CU:       ; %bb.0: ; %entry
6597; GFX10-CU-NEXT:    s_clause 0x1
6598; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6599; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6600; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6601; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6602; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6603; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6604; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6605; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6606; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6607; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6608; GFX10-CU-NEXT:    s_endpgm
6609;
6610; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6611; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6612; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6613; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6614; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6615; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6616; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6619; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6620; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6621; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6622; SKIP-CACHE-INV-NEXT:    s_endpgm
6623;
6624; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6625; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6626; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6627; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6628; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6629; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6630; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6631; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6632; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6633;
6634; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
6635; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6636; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6637; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6638; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6639; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6640; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6641; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6642; GFX90A-TGSPLIT-NEXT:    s_endpgm
6643    i32* %out, i32 %in, i32 %old) {
6644entry:
6645  %gep = getelementptr i32, i32* %out, i32 4
6646  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
6647  ret void
6648}
6649
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering monotonic and failure ordering monotonic. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
6650define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
6651; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6652; GFX7:       ; %bb.0: ; %entry
6653; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6654; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6655; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6656; GFX7-NEXT:    s_add_u32 s4, s0, 16
6657; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6658; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6659; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6660; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6661; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6662; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6663; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6664; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6665; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6666; GFX7-NEXT:    flat_store_dword v[0:1], v2
6667; GFX7-NEXT:    s_endpgm
6668;
6669; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6670; GFX10-WGP:       ; %bb.0: ; %entry
6671; GFX10-WGP-NEXT:    s_clause 0x1
6672; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6673; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6674; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6675; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6676; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6677; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6678; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6679; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6680; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6681; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6682; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6683; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6684; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6685; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6686; GFX10-WGP-NEXT:    s_endpgm
6687;
6688; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6689; GFX10-CU:       ; %bb.0: ; %entry
6690; GFX10-CU-NEXT:    s_clause 0x1
6691; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6692; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6693; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6694; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
6695; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
6696; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6697; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6698; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6699; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6700; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6701; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6702; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6703; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6704; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6705; GFX10-CU-NEXT:    s_endpgm
6706;
6707; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6708; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6709; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6710; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6711; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6712; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
6713; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
6714; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6715; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6716; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
6717; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6718; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6719; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6720; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6721; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6722; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6723; SKIP-CACHE-INV-NEXT:    s_endpgm
6724;
6725; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6726; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6727; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6728; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6729; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6730; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6731; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6732; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6733; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6734; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6735; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6736;
6737; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
6738; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6739; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6740; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6741; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6742; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6743; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6744; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6745; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6746; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6747; GFX90A-TGSPLIT-NEXT:    s_endpgm
6748    i32* %out, i32 %in, i32 %old) {
6749entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
6750  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
6751  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
6752  %val0 = extractvalue { i32, i1 } %val, 0
6753  store i32 %val0, i32* %out, align 4
6754  ret void
6755}
6756
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering acquire and failure ordering monotonic. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
6757define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
6758; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6759; GFX7:       ; %bb.0: ; %entry
6760; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6761; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6762; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6763; GFX7-NEXT:    s_add_u32 s4, s0, 16
6764; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6765; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6766; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6767; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6768; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6769; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6770; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6771; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6772; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6773; GFX7-NEXT:    flat_store_dword v[0:1], v2
6774; GFX7-NEXT:    s_endpgm
6775;
6776; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6777; GFX10-WGP:       ; %bb.0: ; %entry
6778; GFX10-WGP-NEXT:    s_clause 0x1
6779; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6780; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6781; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6782; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6783; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6784; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6785; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6786; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6787; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6788; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6789; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6790; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6791; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6792; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6793; GFX10-WGP-NEXT:    s_endpgm
6794;
6795; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6796; GFX10-CU:       ; %bb.0: ; %entry
6797; GFX10-CU-NEXT:    s_clause 0x1
6798; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6799; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6800; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6801; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
6802; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
6803; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6804; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6805; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6806; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6807; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6808; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6809; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6810; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6811; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6812; GFX10-CU-NEXT:    s_endpgm
6813;
6814; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6815; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6816; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6817; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6818; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6819; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
6820; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
6821; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6822; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6823; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
6824; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6825; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6826; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6827; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6828; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6829; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6830; SKIP-CACHE-INV-NEXT:    s_endpgm
6831;
6832; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6833; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6834; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6835; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6836; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6837; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6838; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6839; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6840; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6841; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6842; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6843;
6844; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
6845; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6846; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6847; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6848; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6849; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6850; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6851; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6852; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6853; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6854; GFX90A-TGSPLIT-NEXT:    s_endpgm
6855    i32* %out, i32 %in, i32 %old) {
6856entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
6857  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
6858  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
6859  %val0 = extractvalue { i32, i1 } %val, 0
6860  store i32 %val0, i32* %out, align 4
6861  ret void
6862}
6863
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering release and failure ordering monotonic. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
6864define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
6865; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6866; GFX7:       ; %bb.0: ; %entry
6867; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6868; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6869; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6870; GFX7-NEXT:    s_add_u32 s4, s0, 16
6871; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6872; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6873; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6874; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6875; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6876; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6877; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6878; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6879; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6880; GFX7-NEXT:    flat_store_dword v[0:1], v2
6881; GFX7-NEXT:    s_endpgm
6882;
6883; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6884; GFX10-WGP:       ; %bb.0: ; %entry
6885; GFX10-WGP-NEXT:    s_clause 0x1
6886; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6887; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6888; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6889; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6890; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6891; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6892; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6893; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6894; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6895; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6896; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6897; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6898; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6899; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6900; GFX10-WGP-NEXT:    s_endpgm
6901;
6902; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6903; GFX10-CU:       ; %bb.0: ; %entry
6904; GFX10-CU-NEXT:    s_clause 0x1
6905; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6906; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6907; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6908; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
6909; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
6910; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6911; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6912; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6913; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6914; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6915; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6916; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6917; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6918; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6919; GFX10-CU-NEXT:    s_endpgm
6920;
6921; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6922; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6923; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6924; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6925; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6926; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
6927; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
6928; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6929; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6930; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
6931; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6932; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6934; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6935; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6936; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6937; SKIP-CACHE-INV-NEXT:    s_endpgm
6938;
6939; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6940; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6941; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6942; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6943; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6944; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6945; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6946; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6947; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6948; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6949; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6950;
6951; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
6952; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6953; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6954; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6955; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6956; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6957; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6958; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6959; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6960; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6961; GFX90A-TGSPLIT-NEXT:    s_endpgm
6962    i32* %out, i32 %in, i32 %old) {
6963entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
6964  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
6965  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
6966  %val0 = extractvalue { i32, i1 } %val, 0
6967  store i32 %val0, i32* %out, align 4
6968  ret void
6969}
6970
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering acq_rel and failure ordering monotonic. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
6971define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
6972; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
6973; GFX7:       ; %bb.0: ; %entry
6974; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6975; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6976; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6977; GFX7-NEXT:    s_add_u32 s4, s0, 16
6978; GFX7-NEXT:    s_addc_u32 s5, s1, 0
6979; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6980; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6981; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6982; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6983; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6984; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6985; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6986; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6987; GFX7-NEXT:    flat_store_dword v[0:1], v2
6988; GFX7-NEXT:    s_endpgm
6989;
6990; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
6991; GFX10-WGP:       ; %bb.0: ; %entry
6992; GFX10-WGP-NEXT:    s_clause 0x1
6993; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6994; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6995; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6996; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
6997; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
6998; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6999; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7000; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7001; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7002; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7003; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7004; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7005; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7006; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7007; GFX10-WGP-NEXT:    s_endpgm
7008;
7009; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
7010; GFX10-CU:       ; %bb.0: ; %entry
7011; GFX10-CU-NEXT:    s_clause 0x1
7012; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7013; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7014; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7015; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7016; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7017; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7018; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7019; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7020; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7021; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7022; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7023; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7024; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7025; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7026; GFX10-CU-NEXT:    s_endpgm
7027;
7028; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
7029; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7030; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7031; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7032; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7033; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7034; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7036; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7037; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7038; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7039; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7040; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7041; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7042; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7043; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7044; SKIP-CACHE-INV-NEXT:    s_endpgm
7045;
7046; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
7047; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7048; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7049; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7050; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7051; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7052; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7053; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7054; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7055; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7056; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7057;
7058; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
7059; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7060; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7061; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7062; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7063; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7064; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7065; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7066; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7067; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7068; GFX90A-TGSPLIT-NEXT:    s_endpgm
7069    i32* %out, i32 %in, i32 %old) {
7070entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
7071  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
7072  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
7073  %val0 = extractvalue { i32, i1 } %val, 0
7074  store i32 %val0, i32* %out, align 4
7075  ret void
7076}
7077
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering seq_cst and failure ordering monotonic. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
7078define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
7079; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7080; GFX7:       ; %bb.0: ; %entry
7081; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7082; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7083; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7084; GFX7-NEXT:    s_add_u32 s4, s0, 16
7085; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7086; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7087; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7088; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7089; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7090; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7091; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7092; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7093; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7094; GFX7-NEXT:    flat_store_dword v[0:1], v2
7095; GFX7-NEXT:    s_endpgm
7096;
7097; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7098; GFX10-WGP:       ; %bb.0: ; %entry
7099; GFX10-WGP-NEXT:    s_clause 0x1
7100; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7101; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7102; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7103; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7104; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7105; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7106; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7107; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7108; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7109; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7110; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7111; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7112; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7113; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7114; GFX10-WGP-NEXT:    s_endpgm
7115;
7116; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7117; GFX10-CU:       ; %bb.0: ; %entry
7118; GFX10-CU-NEXT:    s_clause 0x1
7119; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7120; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7121; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7122; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7123; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7124; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7125; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7126; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7127; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7128; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7129; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7130; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7131; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7132; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7133; GFX10-CU-NEXT:    s_endpgm
7134;
7135; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7136; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7137; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7138; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7139; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7140; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7141; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7145; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7146; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7147; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7148; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7149; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7150; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7151; SKIP-CACHE-INV-NEXT:    s_endpgm
7152;
7153; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7154; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7155; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7156; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7157; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7158; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7159; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7160; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7161; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7162; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7163; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7164;
7165; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
7166; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7167; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7168; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7169; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7170; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7171; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7172; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7173; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7174; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7175; GFX90A-TGSPLIT-NEXT:    s_endpgm
7176    i32* %out, i32 %in, i32 %old) {
7177entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
7178  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
7179  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
7180  %val0 = extractvalue { i32, i1 } %val, 0
7181  store i32 %val0, i32* %out, align 4
7182  ret void
7183}
7184
; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as"
; with success ordering monotonic and failure ordering acquire. The expected
; output below, for every run configuration, is one flat_atomic_cmpswap with
; glc, a wait, and a flat_store_dword of the returned value; no cache
; invalidation instruction is expected.
7185define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
7186; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7187; GFX7:       ; %bb.0: ; %entry
7188; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7189; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7190; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7191; GFX7-NEXT:    s_add_u32 s4, s0, 16
7192; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7193; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7194; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7195; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7196; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7197; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7198; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7199; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7200; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7201; GFX7-NEXT:    flat_store_dword v[0:1], v2
7202; GFX7-NEXT:    s_endpgm
7203;
7204; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7205; GFX10-WGP:       ; %bb.0: ; %entry
7206; GFX10-WGP-NEXT:    s_clause 0x1
7207; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7208; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7209; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7210; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7211; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7212; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7213; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7214; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7215; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7216; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7219; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7220; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7221; GFX10-WGP-NEXT:    s_endpgm
7222;
7223; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7224; GFX10-CU:       ; %bb.0: ; %entry
7225; GFX10-CU-NEXT:    s_clause 0x1
7226; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7227; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7228; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7229; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7230; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7232; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7233; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7234; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7235; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7236; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7237; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7238; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7239; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7240; GFX10-CU-NEXT:    s_endpgm
7241;
7242; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7243; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7244; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7245; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7246; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7247; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7248; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7252; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7253; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7255; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7256; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7257; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7258; SKIP-CACHE-INV-NEXT:    s_endpgm
7259;
7260; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7261; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7262; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7263; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7264; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7265; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7266; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7267; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7268; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7269; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7270; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7271;
7272; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
7273; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7275; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7276; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7277; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7278; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7279; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7280; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7281; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7282; GFX90A-TGSPLIT-NEXT:    s_endpgm
7283    i32* %out, i32 %in, i32 %old) {
7284entry:
  ; Element 4 of %out, i.e. 16 bytes past the base pointer for i32 elements.
7285  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on match, write %in.
7286  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
  ; Take the i32 member of the { i32, i1 } result and store it to %out so the
  ; returned value is live.
7287  %val0 = extractvalue { i32, i1 } %val, 0
7288  store i32 %val0, i32* %out, align 4
7289  ret void
7290}
7291
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering acquire and failure ordering acquire, whose result is used.
; %old is the compare operand and %in the new-value operand; the i32 field of
; the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7292define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
7293; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7294; GFX7:       ; %bb.0: ; %entry
7295; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7296; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7297; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7298; GFX7-NEXT:    s_add_u32 s4, s0, 16
7299; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7300; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7301; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7302; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7303; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7304; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7305; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7306; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7307; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7308; GFX7-NEXT:    flat_store_dword v[0:1], v2
7309; GFX7-NEXT:    s_endpgm
7310;
7311; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7312; GFX10-WGP:       ; %bb.0: ; %entry
7313; GFX10-WGP-NEXT:    s_clause 0x1
7314; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7315; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7316; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7317; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7318; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7319; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7320; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7321; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7322; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7323; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7324; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7325; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7326; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7327; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7328; GFX10-WGP-NEXT:    s_endpgm
7329;
7330; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7331; GFX10-CU:       ; %bb.0: ; %entry
7332; GFX10-CU-NEXT:    s_clause 0x1
7333; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7334; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7335; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7336; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7337; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7338; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7339; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7340; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7341; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7342; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7343; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7344; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7345; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7346; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7347; GFX10-CU-NEXT:    s_endpgm
7348;
7349; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7350; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7351; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7352; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7353; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7354; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7355; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7357; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7360; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7361; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7362; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7363; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7364; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7365; SKIP-CACHE-INV-NEXT:    s_endpgm
7366;
7367; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7368; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7369; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7370; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7371; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7372; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7373; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7374; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7375; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7376; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7377; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7378;
7379; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
7380; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7381; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7382; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7383; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7384; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7385; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7386; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7387; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7388; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7389; GFX90A-TGSPLIT-NEXT:    s_endpgm
7390    i32* %out, i32 %in, i32 %old) {
7391entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7392  %gep = getelementptr i32, i32* %out, i32 4
7393  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7394  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7395  store i32 %val0, i32* %out, align 4
7396  ret void
7397}
7398
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering release and failure ordering acquire, whose result is used.
; %old is the compare operand and %in the new-value operand; the i32 field of
; the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword,
; with nothing extra emitted ahead of the atomic for the release ordering.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7399define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
7400; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7401; GFX7:       ; %bb.0: ; %entry
7402; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7403; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7404; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7405; GFX7-NEXT:    s_add_u32 s4, s0, 16
7406; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7407; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7408; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7409; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7410; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7411; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7412; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7413; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7414; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7415; GFX7-NEXT:    flat_store_dword v[0:1], v2
7416; GFX7-NEXT:    s_endpgm
7417;
7418; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7419; GFX10-WGP:       ; %bb.0: ; %entry
7420; GFX10-WGP-NEXT:    s_clause 0x1
7421; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7422; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7423; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7424; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7425; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7426; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7427; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7428; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7429; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7430; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7431; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7432; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7433; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7434; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7435; GFX10-WGP-NEXT:    s_endpgm
7436;
7437; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7438; GFX10-CU:       ; %bb.0: ; %entry
7439; GFX10-CU-NEXT:    s_clause 0x1
7440; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7441; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7442; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7443; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7444; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7445; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7446; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7447; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7448; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7449; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7450; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7451; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7452; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7453; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7454; GFX10-CU-NEXT:    s_endpgm
7455;
7456; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7457; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7458; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7459; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7460; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7461; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7462; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7463; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7464; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7465; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7466; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7467; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7468; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7469; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7470; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7471; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7472; SKIP-CACHE-INV-NEXT:    s_endpgm
7473;
7474; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7475; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7476; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7477; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7478; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7479; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7480; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7481; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7482; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7483; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7484; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7485;
7486; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
7487; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7488; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7489; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7490; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7491; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7492; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7493; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7494; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7495; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7496; GFX90A-TGSPLIT-NEXT:    s_endpgm
7497    i32* %out, i32 %in, i32 %old) {
7498entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7499  %gep = getelementptr i32, i32* %out, i32 4
7500  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7501  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7502  store i32 %val0, i32* %out, align 4
7503  ret void
7504}
7505
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering acq_rel and failure ordering acquire, whose result is used.
; %old is the compare operand and %in the new-value operand; the i32 field of
; the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7506define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
7507; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7508; GFX7:       ; %bb.0: ; %entry
7509; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7510; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7511; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7512; GFX7-NEXT:    s_add_u32 s4, s0, 16
7513; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7514; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7515; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7516; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7517; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7518; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7519; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7520; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7521; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7522; GFX7-NEXT:    flat_store_dword v[0:1], v2
7523; GFX7-NEXT:    s_endpgm
7524;
7525; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7526; GFX10-WGP:       ; %bb.0: ; %entry
7527; GFX10-WGP-NEXT:    s_clause 0x1
7528; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7529; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7530; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7531; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7532; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7533; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7534; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7535; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7536; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7537; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7538; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7539; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7540; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7541; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7542; GFX10-WGP-NEXT:    s_endpgm
7543;
7544; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7545; GFX10-CU:       ; %bb.0: ; %entry
7546; GFX10-CU-NEXT:    s_clause 0x1
7547; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7548; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7549; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7550; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7551; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7552; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7553; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7554; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7555; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7556; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7557; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7558; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7559; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7560; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7561; GFX10-CU-NEXT:    s_endpgm
7562;
7563; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7564; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7565; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7566; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7567; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7568; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7569; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7570; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7571; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7572; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7573; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7574; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7575; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7576; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7577; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7578; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7579; SKIP-CACHE-INV-NEXT:    s_endpgm
7580;
7581; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7582; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7583; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7584; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7585; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7586; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7587; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7588; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7589; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7590; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7591; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7592;
7593; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
7594; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7595; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7596; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7597; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7598; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7599; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7600; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7601; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7602; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7603; GFX90A-TGSPLIT-NEXT:    s_endpgm
7604    i32* %out, i32 %in, i32 %old) {
7605entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7606  %gep = getelementptr i32, i32* %out, i32 4
7607  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7608  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7609  store i32 %val0, i32* %out, align 4
7610  ret void
7611}
7612
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering seq_cst and failure ordering acquire, whose result is used.
; %old is the compare operand and %in the new-value operand; the i32 field of
; the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7613define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
7614; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7615; GFX7:       ; %bb.0: ; %entry
7616; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7617; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7618; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7619; GFX7-NEXT:    s_add_u32 s4, s0, 16
7620; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7621; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7623; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7624; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7625; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7626; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7627; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7628; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7629; GFX7-NEXT:    flat_store_dword v[0:1], v2
7630; GFX7-NEXT:    s_endpgm
7631;
7632; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7633; GFX10-WGP:       ; %bb.0: ; %entry
7634; GFX10-WGP-NEXT:    s_clause 0x1
7635; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7636; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7637; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7638; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7639; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7640; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7641; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7642; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7643; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7644; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7645; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7646; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7647; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7648; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7649; GFX10-WGP-NEXT:    s_endpgm
7650;
7651; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7652; GFX10-CU:       ; %bb.0: ; %entry
7653; GFX10-CU-NEXT:    s_clause 0x1
7654; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7655; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7656; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7657; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7658; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7659; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7660; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7661; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7662; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7663; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7664; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7665; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7666; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7667; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7668; GFX10-CU-NEXT:    s_endpgm
7669;
7670; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7671; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7672; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7673; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7674; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7675; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7676; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7677; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7679; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7681; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7684; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7685; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7686; SKIP-CACHE-INV-NEXT:    s_endpgm
7687;
7688; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7689; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7690; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7691; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7692; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7693; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7694; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7695; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7696; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7697; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7698; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7699;
7700; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
7701; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7702; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7703; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7704; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7705; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7706; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7707; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7708; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7709; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7710; GFX90A-TGSPLIT-NEXT:    s_endpgm
7711    i32* %out, i32 %in, i32 %old) {
7712entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7713  %gep = getelementptr i32, i32* %out, i32 4
7714  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7715  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7716  store i32 %val0, i32* %out, align 4
7717  ret void
7718}
7719
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering monotonic and failure ordering seq_cst, whose result is
; used. %old is the compare operand and %in the new-value operand; the i32
; field of the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7720define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
7721; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7722; GFX7:       ; %bb.0: ; %entry
7723; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7724; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7725; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7726; GFX7-NEXT:    s_add_u32 s4, s0, 16
7727; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7728; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7729; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7730; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7731; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7732; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7733; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7734; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7735; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7736; GFX7-NEXT:    flat_store_dword v[0:1], v2
7737; GFX7-NEXT:    s_endpgm
7738;
7739; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7740; GFX10-WGP:       ; %bb.0: ; %entry
7741; GFX10-WGP-NEXT:    s_clause 0x1
7742; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7743; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7744; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7745; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7746; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7747; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7748; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7749; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7750; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7751; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7752; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7753; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7754; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7755; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7756; GFX10-WGP-NEXT:    s_endpgm
7757;
7758; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7759; GFX10-CU:       ; %bb.0: ; %entry
7760; GFX10-CU-NEXT:    s_clause 0x1
7761; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7762; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7763; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7764; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7765; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7766; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7767; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7768; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7769; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7770; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7771; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7772; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7773; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7774; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7775; GFX10-CU-NEXT:    s_endpgm
7776;
7777; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7778; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7779; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7780; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7781; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7782; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7783; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7786; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7787; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7788; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7789; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7790; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7791; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7792; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7793; SKIP-CACHE-INV-NEXT:    s_endpgm
7794;
7795; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7796; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7797; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7798; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7799; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7800; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7801; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7802; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7803; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7804; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7805; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7806;
7807; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
7808; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7809; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7810; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7811; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7812; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7813; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7814; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7815; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7816; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7817; GFX90A-TGSPLIT-NEXT:    s_endpgm
7818    i32* %out, i32 %in, i32 %old) {
7819entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7820  %gep = getelementptr i32, i32* %out, i32 4
7821  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7822  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7823  store i32 %val0, i32* %out, align 4
7824  ret void
7825}
7826
; Test kernel: volatile flat cmpxchg at syncscope("singlethread-one-as") with
; success ordering acquire and failure ordering seq_cst, whose result is used.
; %old is the compare operand and %in the new-value operand; the i32 field of
; the cmpxchg result is stored to %out.
; In the expected output below every run configuration shows a single
; flat_atomic_cmpswap ... glc, then an s_waitcnt, then the flat_store_dword.
; The gfx90a tgsplit run waits on vmcnt(0) only; the other runs wait on
; vmcnt(0) lgkmcnt(0).
7827define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
7828; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7829; GFX7:       ; %bb.0: ; %entry
7830; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7831; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7832; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7833; GFX7-NEXT:    s_add_u32 s4, s0, 16
7834; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7835; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7836; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7837; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7838; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7839; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7840; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7841; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7842; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7843; GFX7-NEXT:    flat_store_dword v[0:1], v2
7844; GFX7-NEXT:    s_endpgm
7845;
7846; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7847; GFX10-WGP:       ; %bb.0: ; %entry
7848; GFX10-WGP-NEXT:    s_clause 0x1
7849; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7850; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7851; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7852; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7853; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7854; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7855; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7856; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7857; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7858; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7859; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7860; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7861; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7862; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7863; GFX10-WGP-NEXT:    s_endpgm
7864;
7865; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7866; GFX10-CU:       ; %bb.0: ; %entry
7867; GFX10-CU-NEXT:    s_clause 0x1
7868; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7869; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7870; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7871; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7872; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7873; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7874; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7875; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7876; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7877; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7878; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7879; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7880; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7881; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7882; GFX10-CU-NEXT:    s_endpgm
7883;
7884; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7885; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7886; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7887; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7888; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7889; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7890; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7891; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7892; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7893; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
7894; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7895; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7896; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7897; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7898; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7899; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7900; SKIP-CACHE-INV-NEXT:    s_endpgm
7901;
7902; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7903; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7904; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7905; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7906; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7907; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7908; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7909; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7910; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7911; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7912; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7913;
7914; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
7915; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7916; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7917; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7918; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7919; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7920; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7921; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7922; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7923; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7924; GFX90A-TGSPLIT-NEXT:    s_endpgm
7925    i32* %out, i32 %in, i32 %old) {
7926entry:
  ; The atomic targets the i32 at index 4 from %out; the checks above show this
  ; as a 16-byte offset (an add of 16, or offset:16 on the gfx90a runs).
7927  %gep = getelementptr i32, i32* %out, i32 4
7928  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
  ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 is ignored.
7929  %val0 = extractvalue { i32, i1 } %val, 0
  ; The result goes to the base pointer %out, not to the %gep address.
7930  store i32 %val0, i32* %out, align 4
7931  ret void
7932}
7933
; Test kernel: a returning cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), success ordering release, failure
; ordering seq_cst.
; The compare-and-swap targets element 4 of %out (byte offset 16, which the
; expected output either adds to the base address or folds into offset:16),
; and the value the cmpxchg read back is then stored to %out[0].
; For every checked configuration the expected sequence around the atomic is
; only the cmpswap with glc, a single s_waitcnt, and the dependent store.
; NOTE(review): presumably no extra fence or cache maintenance is required at
; this scope - confirm against the AMDGPU memory model documentation.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
7934define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
7935; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
7936; GFX7:       ; %bb.0: ; %entry
7937; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7938; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7939; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7940; GFX7-NEXT:    s_add_u32 s4, s0, 16
7941; GFX7-NEXT:    s_addc_u32 s5, s1, 0
7942; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7943; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7944; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7945; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7946; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7947; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7948; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7949; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7950; GFX7-NEXT:    flat_store_dword v[0:1], v2
7951; GFX7-NEXT:    s_endpgm
7952;
7953; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
7954; GFX10-WGP:       ; %bb.0: ; %entry
7955; GFX10-WGP-NEXT:    s_clause 0x1
7956; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7957; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7958; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7959; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
7960; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
7961; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7962; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7963; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7964; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7965; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7966; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7967; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7968; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7969; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7970; GFX10-WGP-NEXT:    s_endpgm
7971;
7972; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
7973; GFX10-CU:       ; %bb.0: ; %entry
7974; GFX10-CU-NEXT:    s_clause 0x1
7975; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7976; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7977; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7978; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
7979; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
7980; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7981; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7982; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7983; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7984; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7985; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7986; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7987; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7988; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7989; GFX10-CU-NEXT:    s_endpgm
7990;
7991; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
7992; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7993; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7994; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7995; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7996; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
7997; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
7998; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7999; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8000; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8001; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8002; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8003; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8004; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8005; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8006; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8007; SKIP-CACHE-INV-NEXT:    s_endpgm
8008;
8009; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
8010; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8011; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8012; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8013; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8014; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8015; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8016; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8017; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8018; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8019; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8020;
8021; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
8022; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8023; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8024; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8025; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8026; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8027; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8028; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8029; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8030; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8031; GFX90A-TGSPLIT-NEXT:    s_endpgm
8032    i32* %out, i32 %in, i32 %old) {
8033entry:
  ; Address of %out[4]: i32 elements, so 16 bytes past %out.
8034  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; release on success, seq_cst on failure.
8035  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
  ; Field 0 of the result pair is the value loaded from memory.
8036  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning (glc) form of the atomic live.
8037  store i32 %val0, i32* %out, align 4
8038  ret void
8039}
8040
; Test kernel: a returning cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), success ordering acq_rel, failure
; ordering seq_cst.
; The compare-and-swap targets element 4 of %out (byte offset 16, which the
; expected output either adds to the base address or folds into offset:16),
; and the value the cmpxchg read back is then stored to %out[0].
; For every checked configuration the expected sequence around the atomic is
; only the cmpswap with glc, a single s_waitcnt, and the dependent store.
; NOTE(review): presumably no extra fence or cache maintenance is required at
; this scope - confirm against the AMDGPU memory model documentation.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8041define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
8042; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8043; GFX7:       ; %bb.0: ; %entry
8044; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8045; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8046; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8047; GFX7-NEXT:    s_add_u32 s4, s0, 16
8048; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8049; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8050; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8051; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8052; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8053; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8054; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8055; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8056; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8057; GFX7-NEXT:    flat_store_dword v[0:1], v2
8058; GFX7-NEXT:    s_endpgm
8059;
8060; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8061; GFX10-WGP:       ; %bb.0: ; %entry
8062; GFX10-WGP-NEXT:    s_clause 0x1
8063; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8064; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8065; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8066; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8067; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8068; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8069; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8070; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8071; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8072; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8073; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8074; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8075; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8076; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8077; GFX10-WGP-NEXT:    s_endpgm
8078;
8079; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8080; GFX10-CU:       ; %bb.0: ; %entry
8081; GFX10-CU-NEXT:    s_clause 0x1
8082; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8083; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8084; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8085; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8086; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8087; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8088; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8089; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8090; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8091; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8092; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8093; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8094; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8095; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8096; GFX10-CU-NEXT:    s_endpgm
8097;
8098; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8099; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8100; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8101; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8102; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8103; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8104; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8105; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8106; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8107; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8108; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8109; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8111; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8112; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8113; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8114; SKIP-CACHE-INV-NEXT:    s_endpgm
8115;
8116; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8117; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8118; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8119; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8120; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8121; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8122; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8123; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8124; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8125; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8126; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8127;
8128; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
8129; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8130; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8131; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8132; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8133; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8134; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8135; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8136; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8137; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8138; GFX90A-TGSPLIT-NEXT:    s_endpgm
8139    i32* %out, i32 %in, i32 %old) {
8140entry:
  ; Address of %out[4]: i32 elements, so 16 bytes past %out.
8141  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; acq_rel on success, seq_cst on failure.
8142  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
  ; Field 0 of the result pair is the value loaded from memory.
8143  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning (glc) form of the atomic live.
8144  store i32 %val0, i32* %out, align 4
8145  ret void
8146}
8147
; Test kernel: a returning cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), with seq_cst as both the success and the
; failure ordering.
; The compare-and-swap targets element 4 of %out (byte offset 16, which the
; expected output either adds to the base address or folds into offset:16),
; and the value the cmpxchg read back is then stored to %out[0].
; For every checked configuration the expected sequence around the atomic is
; only the cmpswap with glc, a single s_waitcnt, and the dependent store.
; NOTE(review): presumably no extra fence or cache maintenance is required at
; this scope - confirm against the AMDGPU memory model documentation.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8148define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
8149; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8150; GFX7:       ; %bb.0: ; %entry
8151; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8152; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8154; GFX7-NEXT:    s_add_u32 s4, s0, 16
8155; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8156; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8157; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8158; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8159; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8160; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8161; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8162; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8163; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8164; GFX7-NEXT:    flat_store_dword v[0:1], v2
8165; GFX7-NEXT:    s_endpgm
8166;
8167; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8168; GFX10-WGP:       ; %bb.0: ; %entry
8169; GFX10-WGP-NEXT:    s_clause 0x1
8170; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8171; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8172; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8173; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8174; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8175; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8176; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8177; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8178; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8179; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8180; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8181; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8182; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8183; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8184; GFX10-WGP-NEXT:    s_endpgm
8185;
8186; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8187; GFX10-CU:       ; %bb.0: ; %entry
8188; GFX10-CU-NEXT:    s_clause 0x1
8189; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8190; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8191; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8192; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8193; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8194; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8195; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8196; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8197; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8198; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8199; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8200; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8201; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8202; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8203; GFX10-CU-NEXT:    s_endpgm
8204;
8205; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8206; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8207; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8208; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8209; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8210; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8211; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8212; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8213; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8216; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8217; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8218; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8219; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8220; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8221; SKIP-CACHE-INV-NEXT:    s_endpgm
8222;
8223; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8224; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8225; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8226; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8227; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8228; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8229; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8230; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8231; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8232; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8233; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8234;
8235; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
8236; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8237; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8238; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8239; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8240; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8241; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8242; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8243; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8244; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8245; GFX90A-TGSPLIT-NEXT:    s_endpgm
8246    i32* %out, i32 %in, i32 %old) {
8247entry:
  ; Address of %out[4]: i32 elements, so 16 bytes past %out.
8248  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; seq_cst on both success and failure.
8249  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Field 0 of the result pair is the value loaded from memory.
8250  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the returning (glc) form of the atomic live.
8251  store i32 %val0, i32* %out, align 4
8252  ret void
8253}
8254
8255