1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
8
; Kernel under test: atomically loads an i32 from %in with
; syncscope("wavefront") and unordered ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated expectations below require, for every RUN configuration, a
; bare flat_load_dword, one s_waitcnt, and a flat_store_dword back to back;
; because the lines are matched consecutively, nothing else may appear between
; them.
9define amdgpu_kernel void @flat_wavefront_unordered_load(
10; GFX7-LABEL: flat_wavefront_unordered_load:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
13; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX7-NEXT:    v_mov_b32_e32 v0, s0
15; GFX7-NEXT:    v_mov_b32_e32 v1, s1
16; GFX7-NEXT:    flat_load_dword v0, v[0:1]
17; GFX7-NEXT:    v_mov_b32_e32 v2, s2
18; GFX7-NEXT:    v_mov_b32_e32 v3, s3
19; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20; GFX7-NEXT:    flat_store_dword v[2:3], v0
21; GFX7-NEXT:    s_endpgm
22;
23; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
24; GFX10-WGP:       ; %bb.0: ; %entry
25; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
26; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
28; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
29; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
30; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
31; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
32; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
33; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
34; GFX10-WGP-NEXT:    s_endpgm
35;
36; GFX10-CU-LABEL: flat_wavefront_unordered_load:
37; GFX10-CU:       ; %bb.0: ; %entry
38; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
39; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
43; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
44; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
45; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
46; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
47; GFX10-CU-NEXT:    s_endpgm
48;
49; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load:
50; SKIP-CACHE-INV:       ; %bb.0: ; %entry
51; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
52; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
53; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
54; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
55; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
56; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
57; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
58; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
59; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
60; SKIP-CACHE-INV-NEXT:    s_endpgm
61;
62; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
63; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
64; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
65; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
67; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
68; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
69; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
70; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
71; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
72; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
73; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
74;
75; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
76; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
77; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
78; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
80; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
81; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
82; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
83; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
84; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
85; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
86; GFX90A-TGSPLIT-NEXT:    s_endpgm
87;
88;
89    i32* %in, i32* %out) {
90entry:
91  %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
92  store i32 %val, i32* %out
93  ret void
94}
95
; Kernel under test: atomically loads an i32 from %in with
; syncscope("wavefront") and monotonic ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated expectations below require, for every RUN configuration, a
; bare flat_load_dword, one s_waitcnt, and a flat_store_dword back to back;
; because the lines are matched consecutively, nothing else may appear between
; them.
96define amdgpu_kernel void @flat_wavefront_monotonic_load(
97; GFX7-LABEL: flat_wavefront_monotonic_load:
98; GFX7:       ; %bb.0: ; %entry
99; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
100; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX7-NEXT:    v_mov_b32_e32 v0, s0
102; GFX7-NEXT:    v_mov_b32_e32 v1, s1
103; GFX7-NEXT:    flat_load_dword v0, v[0:1]
104; GFX7-NEXT:    v_mov_b32_e32 v2, s2
105; GFX7-NEXT:    v_mov_b32_e32 v3, s3
106; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
107; GFX7-NEXT:    flat_store_dword v[2:3], v0
108; GFX7-NEXT:    s_endpgm
109;
110; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
111; GFX10-WGP:       ; %bb.0: ; %entry
112; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
113; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
115; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
116; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
117; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
118; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
119; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
120; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
121; GFX10-WGP-NEXT:    s_endpgm
122;
123; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
124; GFX10-CU:       ; %bb.0: ; %entry
125; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
126; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
128; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
129; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
130; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
131; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
132; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
133; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
134; GFX10-CU-NEXT:    s_endpgm
135;
136; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load:
137; SKIP-CACHE-INV:       ; %bb.0: ; %entry
138; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
139; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
140; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
142; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
145; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
146; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
147; SKIP-CACHE-INV-NEXT:    s_endpgm
148;
149; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
150; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
151; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
152; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
154; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
155; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
156; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
157; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
158; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
159; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
160; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
161;
162; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
163; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
164; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
165; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
167; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
168; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
169; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
170; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
171; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
172; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
173; GFX90A-TGSPLIT-NEXT:    s_endpgm
174;
175;
176    i32* %in, i32* %out) {
177entry:
178  %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
179  store i32 %val, i32* %out
180  ret void
181}
182
; Kernel under test: atomically loads an i32 from %in with
; syncscope("wavefront") and acquire ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated expectations below require, for every RUN configuration, a
; bare flat_load_dword, one s_waitcnt, and a flat_store_dword back to back;
; because the lines are matched consecutively, no additional instruction is
; accepted after the load for the acquire.
183define amdgpu_kernel void @flat_wavefront_acquire_load(
184; GFX7-LABEL: flat_wavefront_acquire_load:
185; GFX7:       ; %bb.0: ; %entry
186; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX7-NEXT:    v_mov_b32_e32 v0, s0
189; GFX7-NEXT:    v_mov_b32_e32 v1, s1
190; GFX7-NEXT:    flat_load_dword v0, v[0:1]
191; GFX7-NEXT:    v_mov_b32_e32 v2, s2
192; GFX7-NEXT:    v_mov_b32_e32 v3, s3
193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
194; GFX7-NEXT:    flat_store_dword v[2:3], v0
195; GFX7-NEXT:    s_endpgm
196;
197; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
198; GFX10-WGP:       ; %bb.0: ; %entry
199; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
200; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
202; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
203; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
204; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
205; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
206; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
207; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
208; GFX10-WGP-NEXT:    s_endpgm
209;
210; GFX10-CU-LABEL: flat_wavefront_acquire_load:
211; GFX10-CU:       ; %bb.0: ; %entry
212; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
213; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
215; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
216; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
217; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
218; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
219; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
220; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
221; GFX10-CU-NEXT:    s_endpgm
222;
223; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load:
224; SKIP-CACHE-INV:       ; %bb.0: ; %entry
225; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
226; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
227; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
228; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
229; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
232; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
233; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
234; SKIP-CACHE-INV-NEXT:    s_endpgm
235;
236; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
237; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
238; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
239; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
241; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
242; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
243; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
244; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
245; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
246; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
247; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
248;
249; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
250; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
251; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
252; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
254; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
255; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
256; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
258; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
259; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
260; GFX90A-TGSPLIT-NEXT:    s_endpgm
261;
262;
263    i32* %in, i32* %out) {
264entry:
265  %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
266  store i32 %val, i32* %out
267  ret void
268}
269
; Kernel under test: atomically loads an i32 from %in with
; syncscope("wavefront") and seq_cst ordering, then writes the value to %out
; with an ordinary (non-atomic) store.
; The autogenerated expectations below require, for every RUN configuration, a
; bare flat_load_dword, one s_waitcnt, and a flat_store_dword back to back;
; because the lines are matched consecutively, no additional instruction is
; accepted before or after the load for the seq_cst ordering.
270define amdgpu_kernel void @flat_wavefront_seq_cst_load(
271; GFX7-LABEL: flat_wavefront_seq_cst_load:
272; GFX7:       ; %bb.0: ; %entry
273; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
274; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX7-NEXT:    v_mov_b32_e32 v0, s0
276; GFX7-NEXT:    v_mov_b32_e32 v1, s1
277; GFX7-NEXT:    flat_load_dword v0, v[0:1]
278; GFX7-NEXT:    v_mov_b32_e32 v2, s2
279; GFX7-NEXT:    v_mov_b32_e32 v3, s3
280; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
281; GFX7-NEXT:    flat_store_dword v[2:3], v0
282; GFX7-NEXT:    s_endpgm
283;
284; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
285; GFX10-WGP:       ; %bb.0: ; %entry
286; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
287; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
289; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
290; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
291; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
292; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
293; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
294; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
295; GFX10-WGP-NEXT:    s_endpgm
296;
297; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
298; GFX10-CU:       ; %bb.0: ; %entry
299; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
300; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
302; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
303; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
304; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
305; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
306; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
307; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
308; GFX10-CU-NEXT:    s_endpgm
309;
310; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load:
311; SKIP-CACHE-INV:       ; %bb.0: ; %entry
312; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
313; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
314; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
316; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
317; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
318; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
319; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
320; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
321; SKIP-CACHE-INV-NEXT:    s_endpgm
322;
323; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
324; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
325; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
326; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
328; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
329; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
330; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
331; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
332; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
333; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
334; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
335;
336; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
337; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
338; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
339; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
341; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
342; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
343; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
344; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
345; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
346; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
347; GFX90A-TGSPLIT-NEXT:    s_endpgm
348;
349;
350    i32* %in, i32* %out) {
351entry:
352  %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
353  store i32 %val, i32* %out
354  ret void
355}
356
; Kernel under test: atomically stores the i32 argument %in to %out with
; syncscope("wavefront") and unordered ordering.
; The autogenerated expectations below require, for every RUN configuration,
; that the scalar argument loads and their s_waitcnt are followed only by
; register moves, a single flat_store_dword, and s_endpgm; the lines are
; matched consecutively, so no other instruction may surround the store.
357define amdgpu_kernel void @flat_wavefront_unordered_store(
358; GFX7-LABEL: flat_wavefront_unordered_store:
359; GFX7:       ; %bb.0: ; %entry
360; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
361; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
362; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX7-NEXT:    v_mov_b32_e32 v2, s2
364; GFX7-NEXT:    v_mov_b32_e32 v0, s0
365; GFX7-NEXT:    v_mov_b32_e32 v1, s1
366; GFX7-NEXT:    flat_store_dword v[0:1], v2
367; GFX7-NEXT:    s_endpgm
368;
369; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
370; GFX10-WGP:       ; %bb.0: ; %entry
371; GFX10-WGP-NEXT:    s_clause 0x1
372; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
373; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
374; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
376; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
377; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
378; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
379; GFX10-WGP-NEXT:    s_endpgm
380;
381; GFX10-CU-LABEL: flat_wavefront_unordered_store:
382; GFX10-CU:       ; %bb.0: ; %entry
383; GFX10-CU-NEXT:    s_clause 0x1
384; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
385; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
386; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
388; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
389; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
390; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
391; GFX10-CU-NEXT:    s_endpgm
392;
393; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store:
394; SKIP-CACHE-INV:       ; %bb.0: ; %entry
395; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
396; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
397; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
398; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
399; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
401; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
402; SKIP-CACHE-INV-NEXT:    s_endpgm
403;
404; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
405; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
406; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
407; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
408; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
410; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
411; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
412; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
413;
414; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
415; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
416; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
417; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
418; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
420; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
421; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
422; GFX90A-TGSPLIT-NEXT:    s_endpgm
423;
424;
425    i32 %in, i32* %out) {
426entry:
427  store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
428  ret void
429}
430
; Kernel under test: atomically stores the i32 argument %in to %out with
; syncscope("wavefront") and monotonic ordering.
; The autogenerated expectations below require, for every RUN configuration,
; that the scalar argument loads and their s_waitcnt are followed only by
; register moves, a single flat_store_dword, and s_endpgm; the lines are
; matched consecutively, so no other instruction may surround the store.
431define amdgpu_kernel void @flat_wavefront_monotonic_store(
432; GFX7-LABEL: flat_wavefront_monotonic_store:
433; GFX7:       ; %bb.0: ; %entry
434; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
435; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
436; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX7-NEXT:    v_mov_b32_e32 v2, s2
438; GFX7-NEXT:    v_mov_b32_e32 v0, s0
439; GFX7-NEXT:    v_mov_b32_e32 v1, s1
440; GFX7-NEXT:    flat_store_dword v[0:1], v2
441; GFX7-NEXT:    s_endpgm
442;
443; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
444; GFX10-WGP:       ; %bb.0: ; %entry
445; GFX10-WGP-NEXT:    s_clause 0x1
446; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
447; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
448; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
450; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
451; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
452; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
453; GFX10-WGP-NEXT:    s_endpgm
454;
455; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
456; GFX10-CU:       ; %bb.0: ; %entry
457; GFX10-CU-NEXT:    s_clause 0x1
458; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
459; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
460; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
462; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
463; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
464; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
465; GFX10-CU-NEXT:    s_endpgm
466;
467; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store:
468; SKIP-CACHE-INV:       ; %bb.0: ; %entry
469; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
470; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
471; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
472; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
473; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
474; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
475; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
476; SKIP-CACHE-INV-NEXT:    s_endpgm
477;
478; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
479; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
480; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
481; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
482; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
484; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
485; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
486; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
487;
488; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
489; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
490; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
491; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
492; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
494; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
495; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
496; GFX90A-TGSPLIT-NEXT:    s_endpgm
497;
498;
499    i32 %in, i32* %out) {
500entry:
501  store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
502  ret void
503}
504
; Kernel under test: atomically stores the i32 argument %in to %out with
; syncscope("wavefront") and release ordering.
; The autogenerated expectations below require, for every RUN configuration,
; that the scalar argument loads and their s_waitcnt are followed only by
; register moves, a single flat_store_dword, and s_endpgm; the lines are
; matched consecutively, so no extra wait or other instruction is accepted
; ahead of the store for the release.
505define amdgpu_kernel void @flat_wavefront_release_store(
506; GFX7-LABEL: flat_wavefront_release_store:
507; GFX7:       ; %bb.0: ; %entry
508; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
509; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
510; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
511; GFX7-NEXT:    v_mov_b32_e32 v2, s2
512; GFX7-NEXT:    v_mov_b32_e32 v0, s0
513; GFX7-NEXT:    v_mov_b32_e32 v1, s1
514; GFX7-NEXT:    flat_store_dword v[0:1], v2
515; GFX7-NEXT:    s_endpgm
516;
517; GFX10-WGP-LABEL: flat_wavefront_release_store:
518; GFX10-WGP:       ; %bb.0: ; %entry
519; GFX10-WGP-NEXT:    s_clause 0x1
520; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
521; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
522; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
524; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
525; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
526; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
527; GFX10-WGP-NEXT:    s_endpgm
528;
529; GFX10-CU-LABEL: flat_wavefront_release_store:
530; GFX10-CU:       ; %bb.0: ; %entry
531; GFX10-CU-NEXT:    s_clause 0x1
532; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
533; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
534; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
536; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
537; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
538; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
539; GFX10-CU-NEXT:    s_endpgm
540;
541; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store:
542; SKIP-CACHE-INV:       ; %bb.0: ; %entry
543; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
544; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
545; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
546; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
547; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
549; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
550; SKIP-CACHE-INV-NEXT:    s_endpgm
551;
552; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
553; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
554; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
555; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
556; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
558; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
559; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
560; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
561;
562; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
563; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
564; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
565; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
566; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
568; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
569; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
570; GFX90A-TGSPLIT-NEXT:    s_endpgm
571;
572;
573    i32 %in, i32* %out) {
574entry:
575  store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
576  ret void
577}
578
; Kernel under test: atomically stores the i32 argument %in to %out with
; syncscope("wavefront") and seq_cst ordering.
; The autogenerated expectations below require, for every RUN configuration,
; that the scalar argument loads and their s_waitcnt are followed only by
; register moves, a single flat_store_dword, and s_endpgm; the lines are
; matched consecutively, so no extra wait or other instruction is accepted
; around the store for the seq_cst ordering.
579define amdgpu_kernel void @flat_wavefront_seq_cst_store(
580; GFX7-LABEL: flat_wavefront_seq_cst_store:
581; GFX7:       ; %bb.0: ; %entry
582; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
583; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
584; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX7-NEXT:    v_mov_b32_e32 v2, s2
586; GFX7-NEXT:    v_mov_b32_e32 v0, s0
587; GFX7-NEXT:    v_mov_b32_e32 v1, s1
588; GFX7-NEXT:    flat_store_dword v[0:1], v2
589; GFX7-NEXT:    s_endpgm
590;
591; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
592; GFX10-WGP:       ; %bb.0: ; %entry
593; GFX10-WGP-NEXT:    s_clause 0x1
594; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
595; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
596; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
598; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
599; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
600; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
601; GFX10-WGP-NEXT:    s_endpgm
602;
603; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
604; GFX10-CU:       ; %bb.0: ; %entry
605; GFX10-CU-NEXT:    s_clause 0x1
606; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
607; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
608; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
609; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
610; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
611; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
612; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
613; GFX10-CU-NEXT:    s_endpgm
614;
615; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store:
616; SKIP-CACHE-INV:       ; %bb.0: ; %entry
617; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
618; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
619; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
620; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
621; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
622; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
623; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
624; SKIP-CACHE-INV-NEXT:    s_endpgm
625;
626; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
627; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
628; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
629; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
630; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
632; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
633; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
634; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
635;
636; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
637; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
638; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
639; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
640; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
642; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
643; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
644; GFX90A-TGSPLIT-NEXT:    s_endpgm
645;
646;
647    i32 %in, i32* %out) {
648entry:
649  store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
650  ret void
651}
652
; Kernel under test: performs a volatile atomic exchange of the i32 argument
; %in into the location %out with syncscope("wavefront") and monotonic
; ordering. The exchanged-out value %val is never used.
; The autogenerated expectations below require, for every RUN configuration,
; that the scalar argument loads and their s_waitcnt are followed only by
; register moves, a single flat_atomic_swap that names just the address pair
; and the data register, and s_endpgm; the lines are matched consecutively, so
; no other instruction may surround the atomic.
653define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
654; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
655; GFX7:       ; %bb.0: ; %entry
656; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
657; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
658; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX7-NEXT:    v_mov_b32_e32 v0, s0
660; GFX7-NEXT:    v_mov_b32_e32 v1, s1
661; GFX7-NEXT:    v_mov_b32_e32 v2, s2
662; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
663; GFX7-NEXT:    s_endpgm
664;
665; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
666; GFX10-WGP:       ; %bb.0: ; %entry
667; GFX10-WGP-NEXT:    s_clause 0x1
668; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
669; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
670; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
672; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
673; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
674; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
675; GFX10-WGP-NEXT:    s_endpgm
676;
677; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
678; GFX10-CU:       ; %bb.0: ; %entry
679; GFX10-CU-NEXT:    s_clause 0x1
680; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
681; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
682; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
684; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
685; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
686; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
687; GFX10-CU-NEXT:    s_endpgm
688;
689; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw:
690; SKIP-CACHE-INV:       ; %bb.0: ; %entry
691; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
692; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
693; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
697; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
698; SKIP-CACHE-INV-NEXT:    s_endpgm
699;
700; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
701; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
702; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
703; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
704; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
706; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
707; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
708; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
709;
710; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
711; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
712; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
713; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
714; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
716; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
717; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
718; GFX90A-TGSPLIT-NEXT:    s_endpgm
719;
720;
721    i32* %out, i32 %in) {
722entry:
723  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic
724  ret void
725}
726
; Wavefront-scope atomicrmw xchg on a flat pointer with acquire ordering; the
; old value is not used. Every run line expects the non-returning
; flat_atomic_swap to be followed directly by s_endpgm.
727define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
728; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
729; GFX7:       ; %bb.0: ; %entry
730; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
731; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
732; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX7-NEXT:    v_mov_b32_e32 v0, s0
734; GFX7-NEXT:    v_mov_b32_e32 v1, s1
735; GFX7-NEXT:    v_mov_b32_e32 v2, s2
736; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
737; GFX7-NEXT:    s_endpgm
738;
739; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
740; GFX10-WGP:       ; %bb.0: ; %entry
741; GFX10-WGP-NEXT:    s_clause 0x1
742; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
743; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
744; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
746; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
747; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
748; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
749; GFX10-WGP-NEXT:    s_endpgm
750;
751; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
752; GFX10-CU:       ; %bb.0: ; %entry
753; GFX10-CU-NEXT:    s_clause 0x1
754; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
755; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
756; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
758; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
759; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
760; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
761; GFX10-CU-NEXT:    s_endpgm
762;
763; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw:
764; SKIP-CACHE-INV:       ; %bb.0: ; %entry
765; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
766; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
767; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
768; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
769; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
770; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
771; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
772; SKIP-CACHE-INV-NEXT:    s_endpgm
773;
774; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
775; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
776; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
777; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
778; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
780; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
781; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
782; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
783;
784; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
785; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
786; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
787; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
788; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
790; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
791; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
792; GFX90A-TGSPLIT-NEXT:    s_endpgm
793;
794;
795    i32* %out, i32 %in) {
796entry:
  ; %val is never read; the expectations above show the swap without a
  ; destination register and without glc.
797  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
798  ret void
799}
800
; Wavefront-scope atomicrmw xchg on a flat pointer with release ordering; the
; old value is not used. Every run line expects the argument loads and their
; s_waitcnt lgkmcnt(0), then the non-returning flat_atomic_swap and s_endpgm,
; with no additional wait placed in front of the swap.
801define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
802; GFX7-LABEL: flat_wavefront_release_atomicrmw:
803; GFX7:       ; %bb.0: ; %entry
804; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
805; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
806; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX7-NEXT:    v_mov_b32_e32 v0, s0
808; GFX7-NEXT:    v_mov_b32_e32 v1, s1
809; GFX7-NEXT:    v_mov_b32_e32 v2, s2
810; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
811; GFX7-NEXT:    s_endpgm
812;
813; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
814; GFX10-WGP:       ; %bb.0: ; %entry
815; GFX10-WGP-NEXT:    s_clause 0x1
816; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
817; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
818; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
820; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
821; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
822; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
823; GFX10-WGP-NEXT:    s_endpgm
824;
825; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
826; GFX10-CU:       ; %bb.0: ; %entry
827; GFX10-CU-NEXT:    s_clause 0x1
828; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
829; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
830; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
832; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
833; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
834; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
835; GFX10-CU-NEXT:    s_endpgm
836;
837; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw:
838; SKIP-CACHE-INV:       ; %bb.0: ; %entry
839; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
840; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
841; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
842; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
843; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
845; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
846; SKIP-CACHE-INV-NEXT:    s_endpgm
847;
848; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
849; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
850; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
851; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
852; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
853; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
854; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
855; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
856; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
857;
858; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
859; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
860; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
861; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
862; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
863; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
864; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
865; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
866; GFX90A-TGSPLIT-NEXT:    s_endpgm
867;
868;
869    i32* %out, i32 %in) {
870entry:
  ; %val is never read; the expectations above show the swap without a
  ; destination register and without glc.
871  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release
872  ret void
873}
874
; Wavefront-scope atomicrmw xchg on a flat pointer with acq_rel ordering; the
; old value is not used. Every run line expects the non-returning
; flat_atomic_swap with nothing but s_endpgm after it and no extra wait
; before it.
875define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
876; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
877; GFX7:       ; %bb.0: ; %entry
878; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
879; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
880; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
881; GFX7-NEXT:    v_mov_b32_e32 v0, s0
882; GFX7-NEXT:    v_mov_b32_e32 v1, s1
883; GFX7-NEXT:    v_mov_b32_e32 v2, s2
884; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
885; GFX7-NEXT:    s_endpgm
886;
887; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
888; GFX10-WGP:       ; %bb.0: ; %entry
889; GFX10-WGP-NEXT:    s_clause 0x1
890; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
891; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
892; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
894; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
895; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
896; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
897; GFX10-WGP-NEXT:    s_endpgm
898;
899; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
900; GFX10-CU:       ; %bb.0: ; %entry
901; GFX10-CU-NEXT:    s_clause 0x1
902; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
903; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
904; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
906; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
907; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
908; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
909; GFX10-CU-NEXT:    s_endpgm
910;
911; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw:
912; SKIP-CACHE-INV:       ; %bb.0: ; %entry
913; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
914; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
915; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
919; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
920; SKIP-CACHE-INV-NEXT:    s_endpgm
921;
922; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
923; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
924; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
925; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
926; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
928; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
929; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
930; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
931;
932; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
933; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
934; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
935; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
936; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
938; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
939; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
940; GFX90A-TGSPLIT-NEXT:    s_endpgm
941;
942;
943    i32* %out, i32 %in) {
944entry:
  ; %val is never read; the expectations above show the swap without a
  ; destination register and without glc.
945  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
946  ret void
947}
948
; Wavefront-scope atomicrmw xchg on a flat pointer with seq_cst ordering; the
; old value is not used. Every run line expects the non-returning
; flat_atomic_swap with nothing but s_endpgm after it and no extra wait
; before it.
949define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
950; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
951; GFX7:       ; %bb.0: ; %entry
952; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
953; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
954; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX7-NEXT:    v_mov_b32_e32 v0, s0
956; GFX7-NEXT:    v_mov_b32_e32 v1, s1
957; GFX7-NEXT:    v_mov_b32_e32 v2, s2
958; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
959; GFX7-NEXT:    s_endpgm
960;
961; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
962; GFX10-WGP:       ; %bb.0: ; %entry
963; GFX10-WGP-NEXT:    s_clause 0x1
964; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
965; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
966; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
968; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
969; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
970; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
971; GFX10-WGP-NEXT:    s_endpgm
972;
973; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
974; GFX10-CU:       ; %bb.0: ; %entry
975; GFX10-CU-NEXT:    s_clause 0x1
976; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
977; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
978; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
980; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
981; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
982; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
983; GFX10-CU-NEXT:    s_endpgm
984;
985; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw:
986; SKIP-CACHE-INV:       ; %bb.0: ; %entry
987; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
988; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
989; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
991; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
992; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
993; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
994; SKIP-CACHE-INV-NEXT:    s_endpgm
995;
996; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
997; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
998; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
999; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1001; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1002; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1003; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1004; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1005;
1006; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
1007; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1008; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1009; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1010; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1012; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1013; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1014; GFX90A-TGSPLIT-NEXT:    s_endpgm
1015;
1016;
1017    i32* %out, i32 %in) {
1018entry:
  ; %val is never read; the expectations above show the swap without a
  ; destination register and without glc.
1019  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
1020  ret void
1021}
1022
; Returning variant of the wavefront-scope acquire xchg: the old value is
; stored back through %out. Every run line expects the returning form of
; flat_atomic_swap (destination register plus glc) and an s_waitcnt between
; the swap and the flat_store_dword that consumes its result. The gfx90a
; +tgsplit run expects that wait to cover vmcnt(0) only; all other runs
; expect vmcnt(0) lgkmcnt(0).
1023define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
1024; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1025; GFX7:       ; %bb.0: ; %entry
1026; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1027; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1028; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1030; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1031; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1032; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1033; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1034; GFX7-NEXT:    flat_store_dword v[0:1], v2
1035; GFX7-NEXT:    s_endpgm
1036;
1037; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1038; GFX10-WGP:       ; %bb.0: ; %entry
1039; GFX10-WGP-NEXT:    s_clause 0x1
1040; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1041; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1042; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1044; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1045; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1046; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1047; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1048; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1049; GFX10-WGP-NEXT:    s_endpgm
1050;
1051; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1052; GFX10-CU:       ; %bb.0: ; %entry
1053; GFX10-CU-NEXT:    s_clause 0x1
1054; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1055; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1056; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1057; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1058; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1059; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1060; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1061; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1062; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1063; GFX10-CU-NEXT:    s_endpgm
1064;
1065; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1066; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1067; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1068; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1069; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1072; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1073; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1074; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1075; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1076; SKIP-CACHE-INV-NEXT:    s_endpgm
1077;
1078; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1079; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1080; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1081; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1082; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1084; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1085; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1086; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1087; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1088; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1089;
1090; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1091; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1092; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1093; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1094; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1095; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1097; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1098; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1099; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1100; GFX90A-TGSPLIT-NEXT:    s_endpgm
1101;
1102;
1103    i32* %out, i32 %in) {
1104entry:
1105  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
  ; Consuming %val is what distinguishes this test from the non-returning
  ; acquire variant; the store goes to the same address the swap targeted.
1106  store i32 %val, i32* %out, align 4
1107  ret void
1108}
1109
; Returning variant of the wavefront-scope acq_rel xchg: the old value is
; stored back through %out. Every run line expects the returning form of
; flat_atomic_swap (destination register plus glc) and an s_waitcnt between
; the swap and the flat_store_dword that consumes its result. The gfx90a
; +tgsplit run expects that wait to cover vmcnt(0) only; all other runs
; expect vmcnt(0) lgkmcnt(0).
1110define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
1111; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1112; GFX7:       ; %bb.0: ; %entry
1113; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1114; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1115; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1117; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1118; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1119; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1120; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1121; GFX7-NEXT:    flat_store_dword v[0:1], v2
1122; GFX7-NEXT:    s_endpgm
1123;
1124; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1125; GFX10-WGP:       ; %bb.0: ; %entry
1126; GFX10-WGP-NEXT:    s_clause 0x1
1127; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1128; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1129; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1131; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1132; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1133; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1134; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1135; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1136; GFX10-WGP-NEXT:    s_endpgm
1137;
1138; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1139; GFX10-CU:       ; %bb.0: ; %entry
1140; GFX10-CU-NEXT:    s_clause 0x1
1141; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1142; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1143; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1145; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1146; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1147; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1148; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1149; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1150; GFX10-CU-NEXT:    s_endpgm
1151;
1152; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1153; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1154; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1155; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1156; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1157; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1158; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1159; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1160; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1161; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1162; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1163; SKIP-CACHE-INV-NEXT:    s_endpgm
1164;
1165; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1166; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1167; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1168; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1169; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1171; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1172; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1173; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1174; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1175; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1176;
1177; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
1178; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1179; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1180; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1181; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1182; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1183; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1184; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1185; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1186; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1187; GFX90A-TGSPLIT-NEXT:    s_endpgm
1188;
1189;
1190    i32* %out, i32 %in) {
1191entry:
1192  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
  ; Consuming %val is what distinguishes this test from the non-returning
  ; acq_rel variant; the store goes to the same address the swap targeted.
1193  store i32 %val, i32* %out, align 4
1194  ret void
1195}
1196
; Returning variant of the wavefront-scope seq_cst xchg: the old value is
; stored back through %out. Every run line expects the returning form of
; flat_atomic_swap (destination register plus glc) and an s_waitcnt between
; the swap and the flat_store_dword that consumes its result. The gfx90a
; +tgsplit run expects that wait to cover vmcnt(0) only; all other runs
; expect vmcnt(0) lgkmcnt(0).
1197define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
1198; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1199; GFX7:       ; %bb.0: ; %entry
1200; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1201; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1202; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1204; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1205; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1206; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1207; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1208; GFX7-NEXT:    flat_store_dword v[0:1], v2
1209; GFX7-NEXT:    s_endpgm
1210;
1211; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1212; GFX10-WGP:       ; %bb.0: ; %entry
1213; GFX10-WGP-NEXT:    s_clause 0x1
1214; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1215; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1216; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1219; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1220; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1221; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1222; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1223; GFX10-WGP-NEXT:    s_endpgm
1224;
1225; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1226; GFX10-CU:       ; %bb.0: ; %entry
1227; GFX10-CU-NEXT:    s_clause 0x1
1228; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1229; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1233; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1234; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1235; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1236; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1237; GFX10-CU-NEXT:    s_endpgm
1238;
1239; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1240; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1241; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1242; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1246; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1247; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1248; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1249; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1250; SKIP-CACHE-INV-NEXT:    s_endpgm
1251;
1252; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1253; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1254; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1255; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1256; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1257; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1258; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1259; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1260; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1261; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1262; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1263;
1264; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
1265; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1266; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1267; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1268; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1269; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1270; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1271; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1272; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1273; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1274; GFX90A-TGSPLIT-NEXT:    s_endpgm
1275;
1276;
1277    i32* %out, i32 %in) {
1278entry:
1279  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
  ; Consuming %val is what distinguishes this test from the non-returning
  ; seq_cst variant; the store goes to the same address the swap targeted.
1280  store i32 %val, i32* %out, align 4
1281  ret void
1282}
1283
; Wavefront-scope cmpxchg on a flat pointer with monotonic success and
; monotonic failure ordering; the result is not used. The access targets
; %out advanced by four i32 elements (16 bytes). The gfx700, gfx1010 and
; amdpal runs expect that offset to be added with s_add_u32/s_addc_u32,
; while both gfx90a runs expect it on the instruction as offset:16. Every
; run expects the non-returning flat_atomic_cmpswap followed directly by
; s_endpgm.
1284define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
1285; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1286; GFX7:       ; %bb.0: ; %entry
1287; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1288; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1289; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1290; GFX7-NEXT:    s_add_u32 s0, s0, 16
1291; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1292; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1293; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1294; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1295; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1296; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1297; GFX7-NEXT:    s_endpgm
1298;
1299; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1300; GFX10-WGP:       ; %bb.0: ; %entry
1301; GFX10-WGP-NEXT:    s_clause 0x1
1302; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1303; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1304; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1305; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1306; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1307; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1308; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1310; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1311; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1312; GFX10-WGP-NEXT:    s_endpgm
1313;
1314; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1315; GFX10-CU:       ; %bb.0: ; %entry
1316; GFX10-CU-NEXT:    s_clause 0x1
1317; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1318; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1319; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1320; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1321; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1322; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1323; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1324; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1325; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1326; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1327; GFX10-CU-NEXT:    s_endpgm
1328;
1329; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1330; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1331; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1332; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1333; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1334; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1335; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1336; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1338; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1339; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1340; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1341; SKIP-CACHE-INV-NEXT:    s_endpgm
1342;
1343; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1344; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1345; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1346; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1347; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1348; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1349; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1350; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1351; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1352;
1353; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
1354; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1355; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1356; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1357; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1359; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1360; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1361; GFX90A-TGSPLIT-NEXT:    s_endpgm
1362;
1363;
1364    i32* %out, i32 %in, i32 %old) {
1365entry:
  ; Index 4 on an i32 pointer is the source of the 16-byte offset seen in
  ; the expectations above.
1366  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is (pointer, expected, replacement): %old is compared,
  ; %in is written on success. %val is never read.
1367  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
1368  ret void
1369}
1370
; Wavefront-scope cmpxchg on a flat pointer with acquire success and
; monotonic failure ordering; the result is not used. The access targets
; %out advanced by four i32 elements (16 bytes). The gfx700, gfx1010 and
; amdpal runs expect that offset to be added with s_add_u32/s_addc_u32,
; while both gfx90a runs expect it on the instruction as offset:16. Every
; run expects the non-returning flat_atomic_cmpswap followed directly by
; s_endpgm.
1371define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
1372; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1373; GFX7:       ; %bb.0: ; %entry
1374; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1375; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1376; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1377; GFX7-NEXT:    s_add_u32 s0, s0, 16
1378; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1379; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1380; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1381; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1382; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1383; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1384; GFX7-NEXT:    s_endpgm
1385;
1386; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1387; GFX10-WGP:       ; %bb.0: ; %entry
1388; GFX10-WGP-NEXT:    s_clause 0x1
1389; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1390; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1391; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1392; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1393; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1394; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1395; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1396; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1397; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1398; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1399; GFX10-WGP-NEXT:    s_endpgm
1400;
1401; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1402; GFX10-CU:       ; %bb.0: ; %entry
1403; GFX10-CU-NEXT:    s_clause 0x1
1404; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1405; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1406; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1408; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1409; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1410; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1411; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1412; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1413; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1414; GFX10-CU-NEXT:    s_endpgm
1415;
1416; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1417; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1418; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1419; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1420; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1421; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1422; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1423; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1424; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1425; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1426; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1427; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1428; SKIP-CACHE-INV-NEXT:    s_endpgm
1429;
1430; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1431; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1432; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1433; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1434; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1435; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1436; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1437; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1438; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1439;
1440; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
1441; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1442; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1443; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1444; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1446; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1447; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1448; GFX90A-TGSPLIT-NEXT:    s_endpgm
1449;
1450;
1451    i32* %out, i32 %in, i32 %old) {
1452entry:
  ; Index 4 on an i32 pointer is the source of the 16-byte offset seen in
  ; the expectations above.
1453  %gep = getelementptr i32, i32* %out, i32 4
  ; Operand order is (pointer, expected, replacement): %old is compared,
  ; %in is written on success. %val is never read.
1454  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
1455  ret void
1456}
1457
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering release and failure ordering monotonic. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1458define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
1459; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1460; GFX7:       ; %bb.0: ; %entry
1461; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1462; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1463; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX7-NEXT:    s_add_u32 s0, s0, 16
1465; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1466; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1467; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1468; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1469; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1470; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1471; GFX7-NEXT:    s_endpgm
1472;
1473; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1474; GFX10-WGP:       ; %bb.0: ; %entry
1475; GFX10-WGP-NEXT:    s_clause 0x1
1476; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1477; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1478; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1479; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1480; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1481; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1482; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1483; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1484; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1485; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1486; GFX10-WGP-NEXT:    s_endpgm
1487;
1488; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1489; GFX10-CU:       ; %bb.0: ; %entry
1490; GFX10-CU-NEXT:    s_clause 0x1
1491; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1492; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1493; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1494; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1495; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1496; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1497; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1498; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1499; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1500; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1501; GFX10-CU-NEXT:    s_endpgm
1502;
1503; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1504; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1505; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1506; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1507; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1508; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1509; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1510; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1511; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1512; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1514; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1515; SKIP-CACHE-INV-NEXT:    s_endpgm
1516;
1517; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1518; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1519; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1520; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1521; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1522; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1523; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1524; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1525; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1526;
1527; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
1528; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1529; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1530; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1531; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1532; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1533; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1534; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1535; GFX90A-TGSPLIT-NEXT:    s_endpgm
1536;
1537;
1538    i32* %out, i32 %in, i32 %old) {
1539entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1540  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1541  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
1542  ret void
1543}
1544
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering acq_rel and failure ordering monotonic. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1545define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
1546; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1547; GFX7:       ; %bb.0: ; %entry
1548; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1549; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1550; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1551; GFX7-NEXT:    s_add_u32 s0, s0, 16
1552; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1553; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1554; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1555; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1556; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1557; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1558; GFX7-NEXT:    s_endpgm
1559;
1560; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1561; GFX10-WGP:       ; %bb.0: ; %entry
1562; GFX10-WGP-NEXT:    s_clause 0x1
1563; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1564; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1566; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1567; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1568; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1569; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1570; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1571; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1572; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1573; GFX10-WGP-NEXT:    s_endpgm
1574;
1575; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1576; GFX10-CU:       ; %bb.0: ; %entry
1577; GFX10-CU-NEXT:    s_clause 0x1
1578; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1579; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1580; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1581; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1582; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1583; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1584; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1585; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1586; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1587; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1588; GFX10-CU-NEXT:    s_endpgm
1589;
1590; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1591; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1592; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1593; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1594; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1595; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1596; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1598; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1599; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1600; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1601; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1602; SKIP-CACHE-INV-NEXT:    s_endpgm
1603;
1604; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1605; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1606; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1607; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1608; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1610; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1611; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1612; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1613;
1614; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
1615; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1616; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1617; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1618; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1620; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1621; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1622; GFX90A-TGSPLIT-NEXT:    s_endpgm
1623;
1624;
1625    i32* %out, i32 %in, i32 %old) {
1626entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1627  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1628  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
1629  ret void
1630}
1631
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering seq_cst and failure ordering monotonic. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1632define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
1633; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1634; GFX7:       ; %bb.0: ; %entry
1635; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1636; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1637; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1638; GFX7-NEXT:    s_add_u32 s0, s0, 16
1639; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1640; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1641; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1642; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1643; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1644; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1645; GFX7-NEXT:    s_endpgm
1646;
1647; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1648; GFX10-WGP:       ; %bb.0: ; %entry
1649; GFX10-WGP-NEXT:    s_clause 0x1
1650; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1651; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1652; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1653; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1654; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1655; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1656; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1657; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1658; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1659; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1660; GFX10-WGP-NEXT:    s_endpgm
1661;
1662; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1663; GFX10-CU:       ; %bb.0: ; %entry
1664; GFX10-CU-NEXT:    s_clause 0x1
1665; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1666; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1667; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1668; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1669; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1670; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1671; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1672; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1673; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1674; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1675; GFX10-CU-NEXT:    s_endpgm
1676;
1677; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1678; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1679; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1680; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1681; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1682; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1683; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1686; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1688; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1689; SKIP-CACHE-INV-NEXT:    s_endpgm
1690;
1691; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1692; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1693; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1694; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1695; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1696; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1697; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1698; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1699; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1700;
1701; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
1702; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1703; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1704; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1705; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1706; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1707; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1708; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1709; GFX90A-TGSPLIT-NEXT:    s_endpgm
1710;
1711;
1712    i32* %out, i32 %in, i32 %old) {
1713entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1714  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1715  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
1716  ret void
1717}
1718
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering acquire and failure ordering acquire. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1719define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
1720; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1721; GFX7:       ; %bb.0: ; %entry
1722; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1723; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1724; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX7-NEXT:    s_add_u32 s0, s0, 16
1726; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1727; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1728; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1729; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1730; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1731; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1732; GFX7-NEXT:    s_endpgm
1733;
1734; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1735; GFX10-WGP:       ; %bb.0: ; %entry
1736; GFX10-WGP-NEXT:    s_clause 0x1
1737; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1738; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1739; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1741; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1742; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1743; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1744; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1745; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1746; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1747; GFX10-WGP-NEXT:    s_endpgm
1748;
1749; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1750; GFX10-CU:       ; %bb.0: ; %entry
1751; GFX10-CU-NEXT:    s_clause 0x1
1752; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1753; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1754; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1756; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1757; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1758; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1759; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1760; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1761; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1762; GFX10-CU-NEXT:    s_endpgm
1763;
1764; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1765; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1766; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1767; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1768; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1769; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1770; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1771; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1772; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1773; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1774; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1775; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1776; SKIP-CACHE-INV-NEXT:    s_endpgm
1777;
1778; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1779; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1780; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1781; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1782; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1783; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1784; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1785; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1786; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1787;
1788; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
1789; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1790; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1791; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1792; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1794; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1795; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1796; GFX90A-TGSPLIT-NEXT:    s_endpgm
1797;
1798;
1799    i32* %out, i32 %in, i32 %old) {
1800entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1801  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1802  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
1803  ret void
1804}
1805
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering release and failure ordering acquire. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1806define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
1807; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
1808; GFX7:       ; %bb.0: ; %entry
1809; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1810; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1811; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1812; GFX7-NEXT:    s_add_u32 s0, s0, 16
1813; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1814; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1815; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1816; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1817; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1818; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1819; GFX7-NEXT:    s_endpgm
1820;
1821; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
1822; GFX10-WGP:       ; %bb.0: ; %entry
1823; GFX10-WGP-NEXT:    s_clause 0x1
1824; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1825; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1826; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1827; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1828; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1829; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1830; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1831; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1832; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1833; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1834; GFX10-WGP-NEXT:    s_endpgm
1835;
1836; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
1837; GFX10-CU:       ; %bb.0: ; %entry
1838; GFX10-CU-NEXT:    s_clause 0x1
1839; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1840; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1841; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1843; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1844; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1845; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1846; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1847; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1848; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1849; GFX10-CU-NEXT:    s_endpgm
1850;
1851; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg:
1852; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1853; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1854; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1855; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1856; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1857; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1858; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1859; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1862; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1863; SKIP-CACHE-INV-NEXT:    s_endpgm
1864;
1865; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
1866; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1867; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1868; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1870; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1871; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1872; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1873; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1874;
1875; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
1876; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1877; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1878; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1879; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1880; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1881; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1882; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1883; GFX90A-TGSPLIT-NEXT:    s_endpgm
1884;
1885;
1886    i32* %out, i32 %in, i32 %old) {
1887entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1888  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1889  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
1890  ret void
1891}
1892
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering acq_rel and failure ordering acquire. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1893define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
1894; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1895; GFX7:       ; %bb.0: ; %entry
1896; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1897; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1898; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1899; GFX7-NEXT:    s_add_u32 s0, s0, 16
1900; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1901; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1902; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1903; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1904; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1905; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1906; GFX7-NEXT:    s_endpgm
1907;
1908; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1909; GFX10-WGP:       ; %bb.0: ; %entry
1910; GFX10-WGP-NEXT:    s_clause 0x1
1911; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1912; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1913; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1914; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1915; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1916; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1917; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1918; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1919; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1920; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1921; GFX10-WGP-NEXT:    s_endpgm
1922;
1923; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1924; GFX10-CU:       ; %bb.0: ; %entry
1925; GFX10-CU-NEXT:    s_clause 0x1
1926; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1927; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1928; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1930; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1931; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1932; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1933; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1934; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1935; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1936; GFX10-CU-NEXT:    s_endpgm
1937;
1938; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1939; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1940; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1941; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1942; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1943; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1944; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1945; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1946; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1947; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1948; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1949; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1950; SKIP-CACHE-INV-NEXT:    s_endpgm
1951;
1952; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1953; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1954; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1955; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1956; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1958; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1959; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1960; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1961;
1962; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
1963; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1964; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1965; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1966; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1967; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1968; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1969; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1970; GFX90A-TGSPLIT-NEXT:    s_endpgm
1971;
1972;
1973    i32* %out, i32 %in, i32 %old) {
1974entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
1975  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
1976  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
1977  ret void
1978}
1979
; Test: volatile i32 cmpxchg on %out + 4 elements with syncscope("wavefront"),
; success ordering seq_cst and failure ordering acquire. The cmpxchg result
; (%val) is not used.
;
; For every run configuration the assertions below list only the kernarg loads
; followed by s_waitcnt lgkmcnt(0), the address/operand setup, one
; flat_atomic_cmpswap and s_endpgm; no further waits or cache-maintenance
; instructions appear. The two gfx90a runs fold the 16-byte offset into the
; instruction (offset:16); the other runs add it with s_add_u32/s_addc_u32.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1980define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
1981; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
1982; GFX7:       ; %bb.0: ; %entry
1983; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1984; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1985; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1986; GFX7-NEXT:    s_add_u32 s0, s0, 16
1987; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1988; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1989; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1990; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1991; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1992; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1993; GFX7-NEXT:    s_endpgm
1994;
1995; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
1996; GFX10-WGP:       ; %bb.0: ; %entry
1997; GFX10-WGP-NEXT:    s_clause 0x1
1998; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1999; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2000; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2002; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2003; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2004; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2005; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2006; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2007; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2008; GFX10-WGP-NEXT:    s_endpgm
2009;
2010; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
2011; GFX10-CU:       ; %bb.0: ; %entry
2012; GFX10-CU-NEXT:    s_clause 0x1
2013; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2014; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2015; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2016; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2017; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2018; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2019; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2020; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2021; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2022; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2023; GFX10-CU-NEXT:    s_endpgm
2024;
2025; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
2026; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2027; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2028; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2029; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2030; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2031; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2032; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2033; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2034; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2036; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2037; SKIP-CACHE-INV-NEXT:    s_endpgm
2038;
2039; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
2040; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2041; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2042; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2043; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2044; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2045; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2046; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2047; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2048;
2049; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
2050; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2051; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2052; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2053; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2054; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2055; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2056; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2057; GFX90A-TGSPLIT-NEXT:    s_endpgm
2058;
2059;
2060    i32* %out, i32 %in, i32 %old) {
2061entry:
; i32 index 4 is a 16-byte offset; it shows up as the +16 in the assertions above.
2062  %gep = getelementptr i32, i32* %out, i32 4
; Operand order is pointer, expected value (%old), replacement value (%in).
2063  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
2064  ret void
2065}
2066
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with seq_cst success ordering and seq_cst failure
; ordering. The cmpxchg result is never used. Every expected sequence below
; uses flat_atomic_cmpswap without a destination register or glc bit, and the
; only s_waitcnt in each sequence is the one after the kernel-argument loads.
2067define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
2068; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2069; GFX7:       ; %bb.0: ; %entry
2070; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2071; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2072; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX7-NEXT:    s_add_u32 s0, s0, 16
2074; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2075; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2076; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2077; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2078; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2079; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2080; GFX7-NEXT:    s_endpgm
2081;
2082; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2083; GFX10-WGP:       ; %bb.0: ; %entry
2084; GFX10-WGP-NEXT:    s_clause 0x1
2085; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2086; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2087; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2089; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2090; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2091; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2092; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2093; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2094; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2095; GFX10-WGP-NEXT:    s_endpgm
2096;
2097; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2098; GFX10-CU:       ; %bb.0: ; %entry
2099; GFX10-CU-NEXT:    s_clause 0x1
2100; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2101; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2102; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2104; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2105; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2106; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2107; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2108; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2109; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2110; GFX10-CU-NEXT:    s_endpgm
2111;
2112; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2113; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2114; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2115; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2116; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2117; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2118; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2119; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2122; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2123; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2124; SKIP-CACHE-INV-NEXT:    s_endpgm
2125;
2126; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2127; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2128; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2129; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2130; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2131; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2132; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2133; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2134; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2135;
2136; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
2137; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2138; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2139; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2140; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2141; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2142; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2143; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2144; GFX90A-TGSPLIT-NEXT:    s_endpgm
2145;
2146;
2147    i32* %out, i32 %in, i32 %old) {
2148entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2149  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2150  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
2151  ret void
2152}
2153
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with acquire success ordering and monotonic failure
; ordering. The loaded value is extracted and stored back to %out. Every
; expected sequence below uses the returning form of flat_atomic_cmpswap
; (destination register plus glc) and has an s_waitcnt between the atomic and
; the flat_store_dword that consumes its result.
2154define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
2155; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2156; GFX7:       ; %bb.0: ; %entry
2157; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2158; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2159; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX7-NEXT:    s_add_u32 s4, s0, 16
2161; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2162; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2163; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2164; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2165; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2166; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2167; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2168; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2169; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2170; GFX7-NEXT:    flat_store_dword v[0:1], v2
2171; GFX7-NEXT:    s_endpgm
2172;
2173; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2174; GFX10-WGP:       ; %bb.0: ; %entry
2175; GFX10-WGP-NEXT:    s_clause 0x1
2176; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2177; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2178; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2179; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2180; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2181; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2182; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2183; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2184; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2185; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2186; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2187; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2188; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2189; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2190; GFX10-WGP-NEXT:    s_endpgm
2191;
2192; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2193; GFX10-CU:       ; %bb.0: ; %entry
2194; GFX10-CU-NEXT:    s_clause 0x1
2195; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2196; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2197; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2199; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2200; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2201; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2202; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2203; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2204; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2205; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2206; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2207; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2208; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2209; GFX10-CU-NEXT:    s_endpgm
2210;
2211; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2212; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2213; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2214; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2215; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2216; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2217; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2218; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2219; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2220; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2222; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2225; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2226; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2227; SKIP-CACHE-INV-NEXT:    s_endpgm
2228;
2229; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2230; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2231; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2232; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2233; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2235; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2236; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2237; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2238; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2239; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2240;
2241; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
2242; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2243; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2244; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2245; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2246; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2247; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2248; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2249; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2250; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2251; GFX90A-TGSPLIT-NEXT:    s_endpgm
2252;
2253;
2254    i32* %out, i32 %in, i32 %old) {
2255entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2256  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2257  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this kernel.
2258  %val0 = extractvalue { i32, i1 } %val, 0
2259  store i32 %val0, i32* %out, align 4
2260  ret void
2261}
2262
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with acq_rel success ordering and monotonic failure
; ordering. The loaded value is extracted and stored back to %out. Every
; expected sequence below uses the returning form of flat_atomic_cmpswap
; (destination register plus glc) and has an s_waitcnt between the atomic and
; the flat_store_dword that consumes its result.
2263define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
2264; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2265; GFX7:       ; %bb.0: ; %entry
2266; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2267; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2268; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2269; GFX7-NEXT:    s_add_u32 s4, s0, 16
2270; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2271; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2272; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2273; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2274; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2275; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2276; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2277; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2278; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2279; GFX7-NEXT:    flat_store_dword v[0:1], v2
2280; GFX7-NEXT:    s_endpgm
2281;
2282; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2283; GFX10-WGP:       ; %bb.0: ; %entry
2284; GFX10-WGP-NEXT:    s_clause 0x1
2285; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2286; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2287; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2288; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2289; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2290; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2291; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2292; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2293; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2294; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2295; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2296; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2297; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2298; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2299; GFX10-WGP-NEXT:    s_endpgm
2300;
2301; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2302; GFX10-CU:       ; %bb.0: ; %entry
2303; GFX10-CU-NEXT:    s_clause 0x1
2304; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2305; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2306; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2308; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2309; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2310; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2311; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2312; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2313; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2314; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2315; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2316; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2317; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2318; GFX10-CU-NEXT:    s_endpgm
2319;
2320; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2321; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2322; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2323; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2324; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2325; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2326; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2329; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2331; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2333; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2334; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2335; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2336; SKIP-CACHE-INV-NEXT:    s_endpgm
2337;
2338; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2339; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2340; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2341; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2342; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2343; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2344; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2345; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2346; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2347; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2348; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2349;
2350; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
2351; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2352; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2354; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2355; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2356; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2357; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2358; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2359; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2360; GFX90A-TGSPLIT-NEXT:    s_endpgm
2361;
2362;
2363    i32* %out, i32 %in, i32 %old) {
2364entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2365  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2366  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this kernel.
2367  %val0 = extractvalue { i32, i1 } %val, 0
2368  store i32 %val0, i32* %out, align 4
2369  ret void
2370}
2371
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with seq_cst success ordering and monotonic failure
; ordering. The loaded value is extracted and stored back to %out. Every
; expected sequence below uses the returning form of flat_atomic_cmpswap
; (destination register plus glc) and has an s_waitcnt between the atomic and
; the flat_store_dword that consumes its result.
2372define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
2373; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2374; GFX7:       ; %bb.0: ; %entry
2375; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2376; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2377; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2378; GFX7-NEXT:    s_add_u32 s4, s0, 16
2379; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2380; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2381; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2382; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2383; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2384; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2385; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2386; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2387; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2388; GFX7-NEXT:    flat_store_dword v[0:1], v2
2389; GFX7-NEXT:    s_endpgm
2390;
2391; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2392; GFX10-WGP:       ; %bb.0: ; %entry
2393; GFX10-WGP-NEXT:    s_clause 0x1
2394; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2395; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2396; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2397; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2398; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2399; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2400; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2401; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2402; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2403; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2404; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2405; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2406; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2407; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2408; GFX10-WGP-NEXT:    s_endpgm
2409;
2410; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2411; GFX10-CU:       ; %bb.0: ; %entry
2412; GFX10-CU-NEXT:    s_clause 0x1
2413; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2414; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2415; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2416; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2417; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2418; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2419; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2420; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2421; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2422; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2423; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2424; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2425; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2426; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2427; GFX10-CU-NEXT:    s_endpgm
2428;
2429; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2430; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2431; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2432; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2433; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2434; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2435; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2437; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2438; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2439; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2440; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2443; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2444; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2445; SKIP-CACHE-INV-NEXT:    s_endpgm
2446;
2447; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2448; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2449; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2450; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2451; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2452; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2453; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2454; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2455; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2456; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2457; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2458;
2459; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
2460; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2461; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2462; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2463; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2464; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2465; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2466; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2467; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2468; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2469; GFX90A-TGSPLIT-NEXT:    s_endpgm
2470;
2471;
2472    i32* %out, i32 %in, i32 %old) {
2473entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2474  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2475  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this kernel.
2476  %val0 = extractvalue { i32, i1 } %val, 0
2477  store i32 %val0, i32* %out, align 4
2478  ret void
2479}
2480
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with acquire success ordering and acquire failure
; ordering. The loaded value is extracted and stored back to %out. Every
; expected sequence below uses the returning form of flat_atomic_cmpswap
; (destination register plus glc) and has an s_waitcnt between the atomic and
; the flat_store_dword that consumes its result.
2481define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
2482; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2483; GFX7:       ; %bb.0: ; %entry
2484; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2485; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2486; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2487; GFX7-NEXT:    s_add_u32 s4, s0, 16
2488; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2489; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2490; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2491; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2492; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2493; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2494; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2495; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2496; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2497; GFX7-NEXT:    flat_store_dword v[0:1], v2
2498; GFX7-NEXT:    s_endpgm
2499;
2500; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2501; GFX10-WGP:       ; %bb.0: ; %entry
2502; GFX10-WGP-NEXT:    s_clause 0x1
2503; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2504; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2505; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2507; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2508; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2509; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2510; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2511; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2512; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2513; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2514; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2515; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2516; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2517; GFX10-WGP-NEXT:    s_endpgm
2518;
2519; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2520; GFX10-CU:       ; %bb.0: ; %entry
2521; GFX10-CU-NEXT:    s_clause 0x1
2522; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2523; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2524; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2525; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2526; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2527; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2528; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2529; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2530; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2531; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2532; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2533; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2534; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2535; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2536; GFX10-CU-NEXT:    s_endpgm
2537;
2538; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2539; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2540; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2541; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2542; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2543; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2544; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2545; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2546; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2547; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2549; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2550; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2551; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2552; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2553; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2554; SKIP-CACHE-INV-NEXT:    s_endpgm
2555;
2556; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2557; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2558; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2559; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2560; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2561; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2562; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2563; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2564; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2565; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2566; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2567;
2568; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
2569; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2570; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2571; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2572; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2573; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2574; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2575; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2576; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2577; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2578; GFX90A-TGSPLIT-NEXT:    s_endpgm
2579;
2580;
2581    i32* %out, i32 %in, i32 %old) {
2582entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2583  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2584  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this kernel.
2585  %val0 = extractvalue { i32, i1 } %val, 0
2586  store i32 %val0, i32* %out, align 4
2587  ret void
2588}
2589
; Volatile cmpxchg through an i32 pointer with no address-space qualifier, at
; syncscope("wavefront"), with release success ordering and acquire failure
; ordering. The loaded value is extracted and stored back to %out. Every
; expected sequence below uses the returning form of flat_atomic_cmpswap
; (destination register plus glc) and has an s_waitcnt between the atomic and
; the flat_store_dword that consumes its result.
2590define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
2591; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2592; GFX7:       ; %bb.0: ; %entry
2593; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2594; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2595; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2596; GFX7-NEXT:    s_add_u32 s4, s0, 16
2597; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2598; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2599; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2600; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2601; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2602; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2603; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2604; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2605; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2606; GFX7-NEXT:    flat_store_dword v[0:1], v2
2607; GFX7-NEXT:    s_endpgm
2608;
2609; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2610; GFX10-WGP:       ; %bb.0: ; %entry
2611; GFX10-WGP-NEXT:    s_clause 0x1
2612; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2613; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2614; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2615; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2616; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2617; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2618; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2619; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2620; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2621; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2622; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2623; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2624; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2625; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2626; GFX10-WGP-NEXT:    s_endpgm
2627;
2628; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2629; GFX10-CU:       ; %bb.0: ; %entry
2630; GFX10-CU-NEXT:    s_clause 0x1
2631; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2632; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2633; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2634; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2635; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2636; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2637; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2638; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2639; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2640; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2641; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2642; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2643; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2644; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2645; GFX10-CU-NEXT:    s_endpgm
2646;
2647; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2648; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2649; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2650; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2651; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2652; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2653; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2654; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2655; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2656; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2657; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2658; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2659; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2660; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2661; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2662; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2663; SKIP-CACHE-INV-NEXT:    s_endpgm
2664;
2665; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2666; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2667; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2668; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2669; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2670; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2671; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2672; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2673; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2674; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2675; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2676;
2677; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
2678; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2679; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2680; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2681; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2682; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2683; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2684; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2685; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2686; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2687; GFX90A-TGSPLIT-NEXT:    s_endpgm
2688;
2689;
2690    i32* %out, i32 %in, i32 %old) {
2691entry:
  ; Element 4 of an i32 array is byte offset 16; the expected sequences show
  ; this either as an s_add_u32/s_addc_u32 pair or as an immediate offset of
  ; 16 on the atomic instruction.
2692  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in is the replacement value.
2693  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this kernel.
2694  %val0 = extractvalue { i32, i1 } %val, 0
2695  store i32 %val0, i32* %out, align 4
2696  ret void
2697}
2698
2699define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
2700; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2701; GFX7:       ; %bb.0: ; %entry
2702; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2703; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2704; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2705; GFX7-NEXT:    s_add_u32 s4, s0, 16
2706; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2707; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2708; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2709; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2710; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2711; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2712; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2713; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2714; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2715; GFX7-NEXT:    flat_store_dword v[0:1], v2
2716; GFX7-NEXT:    s_endpgm
2717;
2718; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2719; GFX10-WGP:       ; %bb.0: ; %entry
2720; GFX10-WGP-NEXT:    s_clause 0x1
2721; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2722; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2723; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2724; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2725; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2726; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2727; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2728; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2729; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2730; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2731; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2732; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2733; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2734; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2735; GFX10-WGP-NEXT:    s_endpgm
2736;
2737; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2738; GFX10-CU:       ; %bb.0: ; %entry
2739; GFX10-CU-NEXT:    s_clause 0x1
2740; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2741; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2742; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2743; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2744; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2745; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2746; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2747; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2748; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2749; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2750; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2751; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2752; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2753; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2754; GFX10-CU-NEXT:    s_endpgm
2755;
2756; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2757; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2758; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2759; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2760; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2761; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2762; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2764; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2767; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2768; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2769; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2770; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2771; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2772; SKIP-CACHE-INV-NEXT:    s_endpgm
2773;
2774; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2775; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2776; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2777; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2778; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2779; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2780; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2781; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2782; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2783; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2784; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2785;
2786; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
2787; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2788; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2789; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2790; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2791; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2792; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2793; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2794; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2795; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2796; GFX90A-TGSPLIT-NEXT:    s_endpgm
2797;
2798;
2799    i32* %out, i32 %in, i32 %old) {
2800entry:
2801  %gep = getelementptr i32, i32* %out, i32 4
2802  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
2803  %val0 = extractvalue { i32, i1 } %val, 0
2804  store i32 %val0, i32* %out, align 4
2805  ret void
2806}
2807
; Checks a value-returning cmpxchg at syncscope("wavefront") whose success
; ordering is seq_cst and whose failure ordering is acquire. Field 0 of the
; cmpxchg result is written back through %out.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2808define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
2809; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2810; GFX7:       ; %bb.0: ; %entry
2811; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2812; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2813; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2814; GFX7-NEXT:    s_add_u32 s4, s0, 16
2815; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2816; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2817; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2818; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2819; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2820; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2821; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2822; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2823; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2824; GFX7-NEXT:    flat_store_dword v[0:1], v2
2825; GFX7-NEXT:    s_endpgm
2826;
2827; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2828; GFX10-WGP:       ; %bb.0: ; %entry
2829; GFX10-WGP-NEXT:    s_clause 0x1
2830; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2831; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2832; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2833; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2834; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2835; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2836; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2837; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2838; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2839; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2840; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2841; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2842; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2843; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2844; GFX10-WGP-NEXT:    s_endpgm
2845;
2846; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2847; GFX10-CU:       ; %bb.0: ; %entry
2848; GFX10-CU-NEXT:    s_clause 0x1
2849; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2850; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2851; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2852; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2853; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2854; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2855; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2856; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2857; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2858; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2859; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2860; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2861; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2862; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2863; GFX10-CU-NEXT:    s_endpgm
2864;
2865; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2866; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2867; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2868; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2869; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2870; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2871; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2872; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2873; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2874; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2876; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2877; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2878; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2879; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2880; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2881; SKIP-CACHE-INV-NEXT:    s_endpgm
2882;
2883; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2884; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2885; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2886; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2887; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2888; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2889; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2890; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2891; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2892; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2893; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2894;
2895; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
2896; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2897; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2898; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2899; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2900; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2901; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2902; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2903; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2904; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2905; GFX90A-TGSPLIT-NEXT:    s_endpgm
2906;
2907;
2908    i32* %out, i32 %in, i32 %old) {
2909entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out; the expected output
  ; shows this either as an explicit 64-bit add of 16 or as offset:16.
2910  %gep = getelementptr i32, i32* %out, i32 4
2911  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
  ; Only field 0 of the { i32, i1 } result is kept and stored to %out.
2912  %val0 = extractvalue { i32, i1 } %val, 0
2913  store i32 %val0, i32* %out, align 4
2914  ret void
2915}
2916
; Checks a value-returning cmpxchg at syncscope("wavefront") where both the
; success and the failure ordering are seq_cst. Field 0 of the cmpxchg result
; is written back through %out.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2917define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
2918; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
2919; GFX7:       ; %bb.0: ; %entry
2920; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2921; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2922; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2923; GFX7-NEXT:    s_add_u32 s4, s0, 16
2924; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2925; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2926; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2927; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2928; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2929; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2930; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2931; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2932; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2933; GFX7-NEXT:    flat_store_dword v[0:1], v2
2934; GFX7-NEXT:    s_endpgm
2935;
2936; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
2937; GFX10-WGP:       ; %bb.0: ; %entry
2938; GFX10-WGP-NEXT:    s_clause 0x1
2939; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2940; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2941; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2943; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2944; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2945; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2946; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2947; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2948; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2949; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2950; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2951; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2952; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2953; GFX10-WGP-NEXT:    s_endpgm
2954;
2955; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
2956; GFX10-CU:       ; %bb.0: ; %entry
2957; GFX10-CU-NEXT:    s_clause 0x1
2958; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2959; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2960; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2961; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2962; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2963; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2964; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2965; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2966; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2967; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2968; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2969; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2970; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2971; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2972; GFX10-CU-NEXT:    s_endpgm
2973;
2974; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
2975; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2976; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2977; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2978; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2979; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2980; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2981; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2985; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2986; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2987; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2988; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2989; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2990; SKIP-CACHE-INV-NEXT:    s_endpgm
2991;
2992; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
2993; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2994; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2995; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2996; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2997; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2998; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2999; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3001; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3002; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3003;
3004; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
3005; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3006; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3007; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3008; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3009; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3010; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3011; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3012; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3013; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3014; GFX90A-TGSPLIT-NEXT:    s_endpgm
3015;
3016;
3017    i32* %out, i32 %in, i32 %old) {
3018entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out; the expected output
  ; shows this either as an explicit 64-bit add of 16 or as offset:16.
3019  %gep = getelementptr i32, i32* %out, i32 4
3020  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
  ; Only field 0 of the { i32, i1 } result is kept and stored to %out.
3021  %val0 = extractvalue { i32, i1 } %val, 0
3022  store i32 %val0, i32* %out, align 4
3023  ret void
3024}
3025
; Checks an atomic i32 load from %in at syncscope("wavefront-one-as") with
; unordered ordering; the loaded value is then written to %out by a plain
; store. Every expected-output group contains a single flat_load_dword with
; no glc modifier, one wait, and the flat_store_dword.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3026define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
3027; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
3028; GFX7:       ; %bb.0: ; %entry
3029; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3030; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3031; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3032; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3033; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3034; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3035; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3036; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3037; GFX7-NEXT:    flat_store_dword v[2:3], v0
3038; GFX7-NEXT:    s_endpgm
3039;
3040; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
3041; GFX10-WGP:       ; %bb.0: ; %entry
3042; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3043; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3044; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3045; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3046; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
3047; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
3048; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
3049; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3050; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3051; GFX10-WGP-NEXT:    s_endpgm
3052;
3053; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
3054; GFX10-CU:       ; %bb.0: ; %entry
3055; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3056; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3057; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3058; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3059; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
3060; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
3061; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
3062; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3063; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3064; GFX10-CU-NEXT:    s_endpgm
3065;
3066; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load:
3067; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3068; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3069; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3072; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
3073; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3074; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
3075; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3076; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
3077; SKIP-CACHE-INV-NEXT:    s_endpgm
3078;
3079; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
3080; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3081; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3082; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3083; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3084; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3085; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3086; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3087; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3088; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3089; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3090; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3091;
3092; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
3093; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3094; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3095; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3097; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3098; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3099; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3100; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3101; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3102; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3103; GFX90A-TGSPLIT-NEXT:    s_endpgm
3104;
3105;
3106    i32* %in, i32* %out) {
3107entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live.
3108  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
3109  store i32 %val, i32* %out
3110  ret void
3111}
3112
; Checks an atomic i32 load from %in at syncscope("wavefront-one-as") with
; monotonic ordering; the loaded value is then written to %out by a plain
; store. Every expected-output group contains a single flat_load_dword with
; no glc modifier, one wait, and the flat_store_dword.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3113define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
3114; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
3115; GFX7:       ; %bb.0: ; %entry
3116; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3117; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3118; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3119; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3120; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3121; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3122; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3123; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3124; GFX7-NEXT:    flat_store_dword v[2:3], v0
3125; GFX7-NEXT:    s_endpgm
3126;
3127; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
3128; GFX10-WGP:       ; %bb.0: ; %entry
3129; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3130; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3131; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3132; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3133; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
3134; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
3135; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
3136; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3137; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3138; GFX10-WGP-NEXT:    s_endpgm
3139;
3140; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
3141; GFX10-CU:       ; %bb.0: ; %entry
3142; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3143; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3145; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3146; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
3147; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
3148; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
3149; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3150; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3151; GFX10-CU-NEXT:    s_endpgm
3152;
3153; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load:
3154; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3155; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3156; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3157; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3158; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3159; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
3160; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3161; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
3162; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3163; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
3164; SKIP-CACHE-INV-NEXT:    s_endpgm
3165;
3166; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
3167; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3168; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3169; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3170; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3171; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3172; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3173; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3174; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3175; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3176; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3177; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3178;
3179; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
3180; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3181; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3182; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3183; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3184; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3185; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3186; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3187; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3188; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3189; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3190; GFX90A-TGSPLIT-NEXT:    s_endpgm
3191;
3192;
3193    i32* %in, i32* %out) {
3194entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live.
3195  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
3196  store i32 %val, i32* %out
3197  ret void
3198}
3199
; Checks an atomic i32 load from %in at syncscope("wavefront-one-as") with
; acquire ordering; the loaded value is then written to %out by a plain
; store. Every expected-output group contains a single flat_load_dword with
; no glc modifier, one wait, and the flat_store_dword.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3200define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
3201; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
3202; GFX7:       ; %bb.0: ; %entry
3203; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3204; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3205; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3206; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3207; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3208; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3209; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3210; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3211; GFX7-NEXT:    flat_store_dword v[2:3], v0
3212; GFX7-NEXT:    s_endpgm
3213;
3214; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
3215; GFX10-WGP:       ; %bb.0: ; %entry
3216; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3217; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3218; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3219; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3220; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
3221; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
3222; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
3223; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3224; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3225; GFX10-WGP-NEXT:    s_endpgm
3226;
3227; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
3228; GFX10-CU:       ; %bb.0: ; %entry
3229; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3233; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
3234; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
3235; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
3236; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3237; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3238; GFX10-CU-NEXT:    s_endpgm
3239;
3240; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load:
3241; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3242; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3246; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
3247; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3248; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
3249; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3250; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
3251; SKIP-CACHE-INV-NEXT:    s_endpgm
3252;
3253; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
3254; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3255; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3256; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3257; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3258; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3259; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3260; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3261; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3262; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3263; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3264; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3265;
3266; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
3267; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3268; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3269; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3270; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3271; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3272; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3273; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3274; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3275; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3276; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3277; GFX90A-TGSPLIT-NEXT:    s_endpgm
3278;
3279;
3280    i32* %in, i32* %out) {
3281entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live.
3282  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
3283  store i32 %val, i32* %out
3284  ret void
3285}
3286
; Checks an atomic i32 load from %in at syncscope("wavefront-one-as") with
; seq_cst ordering; the loaded value is then written to %out by a plain
; store. Every expected-output group contains a single flat_load_dword with
; no glc modifier, one wait, and the flat_store_dword.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3287define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
3288; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
3289; GFX7:       ; %bb.0: ; %entry
3290; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3291; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3292; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3293; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3294; GFX7-NEXT:    flat_load_dword v0, v[0:1]
3295; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3296; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3297; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3298; GFX7-NEXT:    flat_store_dword v[2:3], v0
3299; GFX7-NEXT:    s_endpgm
3300;
3301; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
3302; GFX10-WGP:       ; %bb.0: ; %entry
3303; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3304; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3305; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3306; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3307; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
3308; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
3309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
3310; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3311; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3312; GFX10-WGP-NEXT:    s_endpgm
3313;
3314; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
3315; GFX10-CU:       ; %bb.0: ; %entry
3316; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3317; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3318; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3319; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3320; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
3321; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
3322; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
3323; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3324; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3325; GFX10-CU-NEXT:    s_endpgm
3326;
3327; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load:
3328; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3329; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3330; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3333; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
3334; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3335; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
3336; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3337; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
3338; SKIP-CACHE-INV-NEXT:    s_endpgm
3339;
3340; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
3341; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3342; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3343; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3345; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3346; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3347; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3348; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3349; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3350; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3351; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3352;
3353; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
3354; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3355; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3356; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
3358; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
3359; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
3360; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3361; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
3362; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3363; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
3364; GFX90A-TGSPLIT-NEXT:    s_endpgm
3365;
3366;
3367    i32* %in, i32* %out) {
3368entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live.
3369  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
3370  store i32 %val, i32* %out
3371  ret void
3372}
3373
; Checks an atomic i32 store of %in to %out at syncscope("wavefront-one-as")
; with unordered ordering. Every expected-output group loads the two kernel
; arguments, waits once for those scalar loads, and then issues a single
; flat_store_dword followed directly by s_endpgm.
; The assertion lines inside the signature are autogenerated, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3374define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
3375; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
3376; GFX7:       ; %bb.0: ; %entry
3377; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3378; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3379; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3380; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3381; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3382; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3383; GFX7-NEXT:    flat_store_dword v[0:1], v2
3384; GFX7-NEXT:    s_endpgm
3385;
3386; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
3387; GFX10-WGP:       ; %bb.0: ; %entry
3388; GFX10-WGP-NEXT:    s_clause 0x1
3389; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3390; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3391; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3392; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3393; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3394; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3395; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3396; GFX10-WGP-NEXT:    s_endpgm
3397;
3398; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
3399; GFX10-CU:       ; %bb.0: ; %entry
3400; GFX10-CU-NEXT:    s_clause 0x1
3401; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3402; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3403; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3404; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3405; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3406; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3407; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3408; GFX10-CU-NEXT:    s_endpgm
3409;
3410; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store:
3411; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3412; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
3413; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3414; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3415; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3418; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3419; SKIP-CACHE-INV-NEXT:    s_endpgm
3420;
3421; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
3422; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3423; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3424; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3425; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3426; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3427; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3428; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3429; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3430;
3431; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
3432; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3433; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3434; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3435; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3436; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3437; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3438; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3439; GFX90A-TGSPLIT-NEXT:    s_endpgm
3440;
3441;
3442    i32 %in, i32* %out) {
3443entry:
  ; The atomic store is the only operation in the kernel body.
3444  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
3445  ret void
3446}
3447
; Atomic store of %in through the flat pointer %out with monotonic ordering at
; syncscope "wavefront-one-as".
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, a single flat_store_dword and s_endpgm. No further wait or cache
; instruction appears in any of the expected sequences.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3448define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
3449; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
3450; GFX7:       ; %bb.0: ; %entry
3451; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3452; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3454; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3455; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3456; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3457; GFX7-NEXT:    flat_store_dword v[0:1], v2
3458; GFX7-NEXT:    s_endpgm
3459;
3460; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
3461; GFX10-WGP:       ; %bb.0: ; %entry
3462; GFX10-WGP-NEXT:    s_clause 0x1
3463; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3464; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3465; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3466; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3467; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3468; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3469; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3470; GFX10-WGP-NEXT:    s_endpgm
3471;
3472; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
3473; GFX10-CU:       ; %bb.0: ; %entry
3474; GFX10-CU-NEXT:    s_clause 0x1
3475; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3476; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3477; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3478; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3479; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3480; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3481; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3482; GFX10-CU-NEXT:    s_endpgm
3483;
3484; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store:
3485; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3486; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
3487; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3488; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3492; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3493; SKIP-CACHE-INV-NEXT:    s_endpgm
3494;
3495; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
3496; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3498; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3500; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3501; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3502; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3503; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3504;
3505; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
3506; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3507; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3508; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3509; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3511; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3512; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3513; GFX90A-TGSPLIT-NEXT:    s_endpgm
3514;
3515;
3516    i32 %in, i32* %out) {
3517entry:
  ; Operation under test: monotonic atomic store at wavefront-one-as scope.
3518  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
3519  ret void
3520}
3521
; Atomic store of %in through the flat pointer %out with release ordering at
; syncscope "wavefront-one-as".
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, a single flat_store_dword and s_endpgm. No wait or cache instruction
; is expected between the register copies and the store.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3522define amdgpu_kernel void @flat_wavefront_one_as_release_store(
3523; GFX7-LABEL: flat_wavefront_one_as_release_store:
3524; GFX7:       ; %bb.0: ; %entry
3525; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3526; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3527; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3529; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3530; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3531; GFX7-NEXT:    flat_store_dword v[0:1], v2
3532; GFX7-NEXT:    s_endpgm
3533;
3534; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
3535; GFX10-WGP:       ; %bb.0: ; %entry
3536; GFX10-WGP-NEXT:    s_clause 0x1
3537; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3538; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3539; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3540; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3541; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3542; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3543; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3544; GFX10-WGP-NEXT:    s_endpgm
3545;
3546; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
3547; GFX10-CU:       ; %bb.0: ; %entry
3548; GFX10-CU-NEXT:    s_clause 0x1
3549; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3550; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3551; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3552; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3553; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3554; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3555; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3556; GFX10-CU-NEXT:    s_endpgm
3557;
3558; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store:
3559; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3560; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
3561; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3562; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3563; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3564; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3566; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3567; SKIP-CACHE-INV-NEXT:    s_endpgm
3568;
3569; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
3570; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3571; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3572; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3573; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3574; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3575; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3576; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3577; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3578;
3579; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
3580; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3581; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3582; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3583; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3585; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3586; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3587; GFX90A-TGSPLIT-NEXT:    s_endpgm
3588;
3589;
3590    i32 %in, i32* %out) {
3591entry:
  ; Operation under test: release atomic store at wavefront-one-as scope.
3592  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
3593  ret void
3594}
3595
; Atomic store of %in through the flat pointer %out with seq_cst ordering at
; syncscope "wavefront-one-as".
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, a single flat_store_dword and s_endpgm. No wait or cache instruction
; is expected between the register copies and the store.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3596define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
3597; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
3598; GFX7:       ; %bb.0: ; %entry
3599; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3600; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3601; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3602; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3603; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3604; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3605; GFX7-NEXT:    flat_store_dword v[0:1], v2
3606; GFX7-NEXT:    s_endpgm
3607;
3608; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
3609; GFX10-WGP:       ; %bb.0: ; %entry
3610; GFX10-WGP-NEXT:    s_clause 0x1
3611; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3612; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3613; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3614; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3615; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3616; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3617; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3618; GFX10-WGP-NEXT:    s_endpgm
3619;
3620; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
3621; GFX10-CU:       ; %bb.0: ; %entry
3622; GFX10-CU-NEXT:    s_clause 0x1
3623; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3624; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3625; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3626; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3627; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3628; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3629; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3630; GFX10-CU-NEXT:    s_endpgm
3631;
3632; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store:
3633; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3634; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
3635; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3636; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3639; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3640; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3641; SKIP-CACHE-INV-NEXT:    s_endpgm
3642;
3643; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
3644; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3645; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3646; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3647; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3648; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3649; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3650; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3651; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3652;
3653; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
3654; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3655; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
3656; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3657; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3658; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3659; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3660; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3661; GFX90A-TGSPLIT-NEXT:    s_endpgm
3662;
3663;
3664    i32 %in, i32* %out) {
3665entry:
  ; Operation under test: seq_cst atomic store at wavefront-one-as scope.
3666  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
3667  ret void
3668}
3669
; Volatile atomicrmw xchg of %in into the flat pointer %out with monotonic
; ordering at syncscope "wavefront-one-as". The result %val is never used.
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, the non-returning form `flat_atomic_swap v[0:1], v2` and s_endpgm.
; No wait or cache instruction is expected around the swap.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3670define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
3671; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3672; GFX7:       ; %bb.0: ; %entry
3673; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3674; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3675; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3677; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3678; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3679; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3680; GFX7-NEXT:    s_endpgm
3681;
3682; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3683; GFX10-WGP:       ; %bb.0: ; %entry
3684; GFX10-WGP-NEXT:    s_clause 0x1
3685; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3686; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3687; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3688; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3689; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3690; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3691; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3692; GFX10-WGP-NEXT:    s_endpgm
3693;
3694; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3695; GFX10-CU:       ; %bb.0: ; %entry
3696; GFX10-CU-NEXT:    s_clause 0x1
3697; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3698; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3699; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3700; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3701; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3702; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3703; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3704; GFX10-CU-NEXT:    s_endpgm
3705;
3706; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3707; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3708; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3709; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3710; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3711; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3712; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3713; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3714; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3715; SKIP-CACHE-INV-NEXT:    s_endpgm
3716;
3717; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3718; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3719; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3720; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3721; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3722; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3723; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3724; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3725; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3726;
3727; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
3728; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3729; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3730; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3731; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3732; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3733; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3734; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3735; GFX90A-TGSPLIT-NEXT:    s_endpgm
3736;
3737;
3738    i32* %out, i32 %in) {
3739entry:
  ; Operation under test: monotonic exchange at wavefront-one-as scope; %val is unused.
3740  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic
3741  ret void
3742}
3743
; Volatile atomicrmw xchg of %in into the flat pointer %out with acquire
; ordering at syncscope "wavefront-one-as". The result %val is never used.
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, the non-returning form `flat_atomic_swap v[0:1], v2` and s_endpgm.
; No wait or cache instruction is expected after the swap.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3744define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
3745; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3746; GFX7:       ; %bb.0: ; %entry
3747; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3748; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3749; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3751; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3752; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3753; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3754; GFX7-NEXT:    s_endpgm
3755;
3756; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3757; GFX10-WGP:       ; %bb.0: ; %entry
3758; GFX10-WGP-NEXT:    s_clause 0x1
3759; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3760; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3761; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3762; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3763; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3764; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3765; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3766; GFX10-WGP-NEXT:    s_endpgm
3767;
3768; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3769; GFX10-CU:       ; %bb.0: ; %entry
3770; GFX10-CU-NEXT:    s_clause 0x1
3771; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3772; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3773; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3774; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3775; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3776; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3777; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3778; GFX10-CU-NEXT:    s_endpgm
3779;
3780; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3781; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3782; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3783; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3784; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3786; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3787; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3788; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3789; SKIP-CACHE-INV-NEXT:    s_endpgm
3790;
3791; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3792; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3793; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3794; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3795; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3796; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3797; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3798; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3799; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3800;
3801; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
3802; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3803; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3804; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3805; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3807; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3808; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3809; GFX90A-TGSPLIT-NEXT:    s_endpgm
3810;
3811;
3812    i32* %out, i32 %in) {
3813entry:
  ; Operation under test: acquire exchange at wavefront-one-as scope; %val is unused.
3814  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
3815  ret void
3816}
3817
; Volatile atomicrmw xchg of %in into the flat pointer %out with release
; ordering at syncscope "wavefront-one-as". The result %val is never used.
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, the non-returning form `flat_atomic_swap v[0:1], v2` and s_endpgm.
; No wait or cache instruction is expected between the copies and the swap.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3818define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
3819; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
3820; GFX7:       ; %bb.0: ; %entry
3821; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3822; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3823; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3824; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3825; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3826; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3827; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3828; GFX7-NEXT:    s_endpgm
3829;
3830; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
3831; GFX10-WGP:       ; %bb.0: ; %entry
3832; GFX10-WGP-NEXT:    s_clause 0x1
3833; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3834; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3835; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3836; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3837; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3838; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3839; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3840; GFX10-WGP-NEXT:    s_endpgm
3841;
3842; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
3843; GFX10-CU:       ; %bb.0: ; %entry
3844; GFX10-CU-NEXT:    s_clause 0x1
3845; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3846; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3847; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3848; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3849; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3850; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3851; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3852; GFX10-CU-NEXT:    s_endpgm
3853;
3854; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw:
3855; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3856; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3857; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3858; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3859; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3862; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3863; SKIP-CACHE-INV-NEXT:    s_endpgm
3864;
3865; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
3866; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3867; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3868; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3870; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3871; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3872; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3873; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3874;
3875; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
3876; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3877; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3878; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3879; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3881; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3882; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3883; GFX90A-TGSPLIT-NEXT:    s_endpgm
3884;
3885;
3886    i32* %out, i32 %in) {
3887entry:
  ; Operation under test: release exchange at wavefront-one-as scope; %val is unused.
3888  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release
3889  ret void
3890}
3891
; Volatile atomicrmw xchg of %in into the flat pointer %out with acq_rel
; ordering at syncscope "wavefront-one-as". The result %val is never used.
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, the non-returning form `flat_atomic_swap v[0:1], v2` and s_endpgm.
; No wait or cache instruction is expected around the swap.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3892define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
3893; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3894; GFX7:       ; %bb.0: ; %entry
3895; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3896; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3897; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3899; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3900; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3901; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3902; GFX7-NEXT:    s_endpgm
3903;
3904; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3905; GFX10-WGP:       ; %bb.0: ; %entry
3906; GFX10-WGP-NEXT:    s_clause 0x1
3907; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3908; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3909; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3910; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3911; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3912; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3913; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3914; GFX10-WGP-NEXT:    s_endpgm
3915;
3916; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3917; GFX10-CU:       ; %bb.0: ; %entry
3918; GFX10-CU-NEXT:    s_clause 0x1
3919; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3920; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3921; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3922; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3923; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3924; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3925; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3926; GFX10-CU-NEXT:    s_endpgm
3927;
3928; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3929; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3930; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3931; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3932; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3934; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3935; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3936; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3937; SKIP-CACHE-INV-NEXT:    s_endpgm
3938;
3939; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3940; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3941; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3942; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3943; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3944; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3945; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3946; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3947; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3948;
3949; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
3950; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3951; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3952; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
3953; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3954; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3955; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
3956; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
3957; GFX90A-TGSPLIT-NEXT:    s_endpgm
3958;
3959;
3960    i32* %out, i32 %in) {
3961entry:
  ; Operation under test: acq_rel exchange at wavefront-one-as scope; %val is unused.
3962  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
3963  ret void
3964}
3965
; Volatile atomicrmw xchg of %in into the flat pointer %out with seq_cst
; ordering at syncscope "wavefront-one-as". The result %val is never used.
; For every run configuration the expectations below form one contiguous
; sequence: the s_load instructions, one s_waitcnt lgkmcnt(0), the register
; copies, the non-returning form `flat_atomic_swap v[0:1], v2` and s_endpgm.
; No wait or cache instruction is expected around the swap.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3966define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
3967; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
3968; GFX7:       ; %bb.0: ; %entry
3969; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3970; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3971; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3972; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3973; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3974; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3975; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3976; GFX7-NEXT:    s_endpgm
3977;
3978; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
3979; GFX10-WGP:       ; %bb.0: ; %entry
3980; GFX10-WGP-NEXT:    s_clause 0x1
3981; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3982; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3983; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3984; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3985; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3986; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3987; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3988; GFX10-WGP-NEXT:    s_endpgm
3989;
3990; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
3991; GFX10-CU:       ; %bb.0: ; %entry
3992; GFX10-CU-NEXT:    s_clause 0x1
3993; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3994; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3995; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3996; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3997; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3998; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3999; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
4000; GFX10-CU-NEXT:    s_endpgm
4001;
4002; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
4003; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4004; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4005; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4006; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4007; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4009; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4010; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
4011; SKIP-CACHE-INV-NEXT:    s_endpgm
4012;
4013; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
4014; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4015; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4016; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4017; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4018; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4019; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4020; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4021; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4022;
4023; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
4024; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4025; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4026; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4027; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4028; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4029; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4030; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
4031; GFX90A-TGSPLIT-NEXT:    s_endpgm
4032;
4033;
4034    i32* %out, i32 %in) {
4035entry:
  ; Operation under test: seq_cst exchange at wavefront-one-as scope; %val is unused.
4036  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
4037  ret void
4038}
4039
; Volatile atomicrmw xchg of %in into the flat pointer %out with acquire
; ordering at syncscope "wavefront-one-as", where the returned value %val is
; then written back to %out with an ordinary (non-atomic) store.
; Because the result is used, every run configuration expects the returning
; form `flat_atomic_swap v2, v[0:1], v2 glc`, followed by one s_waitcnt and
; then the flat_store_dword of v2. No cache instruction is expected.
; The expected s_waitcnt operand differs by configuration: the gfx90a run with
; +tgsplit lists only vmcnt(0), while every other run lists
; vmcnt(0) lgkmcnt(0).
; NOTE(review): that difference is presumably what llc emits for the +tgsplit
; run; confirm by regenerating with utils/update_llc_test_checks.py rather than
; editing the expectation by hand.
; The assertions are autogenerated (see the first line of this file).
4040define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
4041; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4042; GFX7:       ; %bb.0: ; %entry
4043; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4044; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4045; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4046; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4047; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4048; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4049; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4050; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4051; GFX7-NEXT:    flat_store_dword v[0:1], v2
4052; GFX7-NEXT:    s_endpgm
4053;
4054; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4055; GFX10-WGP:       ; %bb.0: ; %entry
4056; GFX10-WGP-NEXT:    s_clause 0x1
4057; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4058; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4059; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4060; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4061; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4062; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4063; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4064; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4065; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4066; GFX10-WGP-NEXT:    s_endpgm
4067;
4068; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4069; GFX10-CU:       ; %bb.0: ; %entry
4070; GFX10-CU-NEXT:    s_clause 0x1
4071; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4072; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4073; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4074; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4075; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4076; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4077; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4078; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4079; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4080; GFX10-CU-NEXT:    s_endpgm
4081;
4082; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4083; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4084; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4085; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4086; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4088; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4090; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4091; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4092; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4093; SKIP-CACHE-INV-NEXT:    s_endpgm
4094;
4095; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4096; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4097; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4098; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4099; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4100; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4101; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4102; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4103; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4104; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4105; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4106;
4107; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
4108; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4109; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4110; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4111; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4112; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4113; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4114; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4115; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4116; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4117; GFX90A-TGSPLIT-NEXT:    s_endpgm
4118;
4119;
4120    i32* %out, i32 %in) {
4121entry:
  ; Operation under test: acquire exchange at wavefront-one-as scope.
4122  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
  ; Using %val here is what makes the expectations use the returning swap form.
4123  store i32 %val, i32* %out, align 4
4124  ret void
4125}
4126
; Returning atomic exchange through a flat pointer with acq_rel ordering at
; syncscope("wavefront-one-as"); the exchanged-out value is stored back to %out.
; In the autogenerated checks below every run line expects a glc
; flat_atomic_swap followed by an s_waitcnt before the dependent store. The
; gfx90a +tgsplit run waits on vmcnt(0) only, all other runs wait on
; vmcnt(0) lgkmcnt(0).
4127define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
4128; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4129; GFX7:       ; %bb.0: ; %entry
4130; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4131; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4132; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4133; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4134; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4135; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4136; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4137; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4138; GFX7-NEXT:    flat_store_dword v[0:1], v2
4139; GFX7-NEXT:    s_endpgm
4140;
4141; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4142; GFX10-WGP:       ; %bb.0: ; %entry
4143; GFX10-WGP-NEXT:    s_clause 0x1
4144; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4145; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4146; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4147; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4148; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4149; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4150; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4151; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4152; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4153; GFX10-WGP-NEXT:    s_endpgm
4154;
4155; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4156; GFX10-CU:       ; %bb.0: ; %entry
4157; GFX10-CU-NEXT:    s_clause 0x1
4158; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4159; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4160; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4161; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4163; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4164; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4165; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4166; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4167; GFX10-CU-NEXT:    s_endpgm
4168;
4169; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4170; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4171; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4172; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4173; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4174; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4175; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4176; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4177; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4178; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4179; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4180; SKIP-CACHE-INV-NEXT:    s_endpgm
4181;
4182; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4183; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4184; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4185; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4186; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4187; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4188; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4189; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4190; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4191; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4192; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4193;
4194; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
4195; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4196; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4197; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4198; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4199; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4200; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4201; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4202; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4203; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4204; GFX90A-TGSPLIT-NEXT:    s_endpgm
4205;
4206;
4207    i32* %out, i32 %in) {
4208entry:
  ; %val is the result of the exchange; storing it keeps the returning (glc)
  ; form of the atomic alive in the expected output.
4209  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
4210  store i32 %val, i32* %out, align 4
4211  ret void
4212}
4213
; Returning atomic exchange through a flat pointer with seq_cst ordering at
; syncscope("wavefront-one-as"); the exchanged-out value is stored back to %out.
; The autogenerated checks below expect a glc flat_atomic_swap, one s_waitcnt,
; then the store. The gfx90a +tgsplit run waits on vmcnt(0) only, all other
; runs wait on vmcnt(0) lgkmcnt(0).
4214define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
4215; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4216; GFX7:       ; %bb.0: ; %entry
4217; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4218; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
4219; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4220; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4221; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4222; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4223; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4224; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4225; GFX7-NEXT:    flat_store_dword v[0:1], v2
4226; GFX7-NEXT:    s_endpgm
4227;
4228; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4229; GFX10-WGP:       ; %bb.0: ; %entry
4230; GFX10-WGP-NEXT:    s_clause 0x1
4231; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4232; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
4233; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4234; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4235; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4236; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4237; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4238; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4239; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4240; GFX10-WGP-NEXT:    s_endpgm
4241;
4242; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4243; GFX10-CU:       ; %bb.0: ; %entry
4244; GFX10-CU-NEXT:    s_clause 0x1
4245; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4246; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
4247; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4248; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4249; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4250; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4251; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4252; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4253; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4254; GFX10-CU-NEXT:    s_endpgm
4255;
4256; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4257; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4258; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4259; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
4260; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4261; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4262; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4263; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4264; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4265; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4266; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4267; SKIP-CACHE-INV-NEXT:    s_endpgm
4268;
4269; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4270; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4271; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4272; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4273; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4274; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4275; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4276; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4277; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4278; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4279; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4280;
4281; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
4282; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4283; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4284; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
4285; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4286; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4287; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4288; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4289; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4290; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4291; GFX90A-TGSPLIT-NEXT:    s_endpgm
4292;
4293;
4294    i32* %out, i32 %in) {
4295entry:
  ; %val is the result of the exchange; storing it keeps the returning (glc)
  ; form of the atomic alive in the expected output.
4296  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
4297  store i32 %val, i32* %out, align 4
4298  ret void
4299}
4300
; Compare-and-swap through a flat pointer with monotonic success and monotonic
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4301define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
4302; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4303; GFX7:       ; %bb.0: ; %entry
4304; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4305; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4306; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4307; GFX7-NEXT:    s_add_u32 s0, s0, 16
4308; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4309; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4310; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4311; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4312; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4313; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4314; GFX7-NEXT:    s_endpgm
4315;
4316; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4317; GFX10-WGP:       ; %bb.0: ; %entry
4318; GFX10-WGP-NEXT:    s_clause 0x1
4319; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4320; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4321; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4322; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4323; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4324; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4325; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4326; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4327; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4328; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4329; GFX10-WGP-NEXT:    s_endpgm
4330;
4331; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4332; GFX10-CU:       ; %bb.0: ; %entry
4333; GFX10-CU-NEXT:    s_clause 0x1
4334; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4335; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4336; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4337; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4338; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4339; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4340; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4341; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4342; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4343; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4344; GFX10-CU-NEXT:    s_endpgm
4345;
4346; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4347; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4348; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4349; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4350; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4351; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4352; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4355; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4357; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4358; SKIP-CACHE-INV-NEXT:    s_endpgm
4359;
4360; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4361; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4362; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4363; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4364; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4365; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4366; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4367; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4368; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4369;
4370; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
4371; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4372; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4373; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4374; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4375; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4376; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4377; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4378; GFX90A-TGSPLIT-NEXT:    s_endpgm
4379;
4380;
4381    i32* %out, i32 %in, i32 %old) {
4382entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4383  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4384  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
4385  ret void
4386}
4387
; Compare-and-swap through a flat pointer with acquire success and monotonic
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4388define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
4389; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4390; GFX7:       ; %bb.0: ; %entry
4391; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4392; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4393; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4394; GFX7-NEXT:    s_add_u32 s0, s0, 16
4395; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4396; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4397; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4398; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4399; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4400; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4401; GFX7-NEXT:    s_endpgm
4402;
4403; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4404; GFX10-WGP:       ; %bb.0: ; %entry
4405; GFX10-WGP-NEXT:    s_clause 0x1
4406; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4407; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4408; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4409; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4410; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4411; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4412; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4413; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4414; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4415; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4416; GFX10-WGP-NEXT:    s_endpgm
4417;
4418; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4419; GFX10-CU:       ; %bb.0: ; %entry
4420; GFX10-CU-NEXT:    s_clause 0x1
4421; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4422; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4423; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4424; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4425; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4426; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4427; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4428; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4429; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4430; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4431; GFX10-CU-NEXT:    s_endpgm
4432;
4433; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4434; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4435; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4436; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4437; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4438; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4439; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4440; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4443; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4444; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4445; SKIP-CACHE-INV-NEXT:    s_endpgm
4446;
4447; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4448; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4449; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4450; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4451; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4452; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4453; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4454; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4455; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4456;
4457; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
4458; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4459; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4460; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4461; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4462; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4463; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4464; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4465; GFX90A-TGSPLIT-NEXT:    s_endpgm
4466;
4467;
4468    i32* %out, i32 %in, i32 %old) {
4469entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4470  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4471  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
4472  ret void
4473}
4474
; Compare-and-swap through a flat pointer with release success and monotonic
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4475define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
4476; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4477; GFX7:       ; %bb.0: ; %entry
4478; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4479; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4480; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4481; GFX7-NEXT:    s_add_u32 s0, s0, 16
4482; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4483; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4484; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4485; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4486; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4487; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4488; GFX7-NEXT:    s_endpgm
4489;
4490; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4491; GFX10-WGP:       ; %bb.0: ; %entry
4492; GFX10-WGP-NEXT:    s_clause 0x1
4493; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4494; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4495; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4496; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4497; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4499; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4500; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4501; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4502; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4503; GFX10-WGP-NEXT:    s_endpgm
4504;
4505; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4506; GFX10-CU:       ; %bb.0: ; %entry
4507; GFX10-CU-NEXT:    s_clause 0x1
4508; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4509; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4510; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4511; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4512; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4513; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4514; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4515; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4516; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4517; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4518; GFX10-CU-NEXT:    s_endpgm
4519;
4520; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4521; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4522; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4523; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4524; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4525; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4526; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4529; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4530; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4531; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4532; SKIP-CACHE-INV-NEXT:    s_endpgm
4533;
4534; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4535; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4536; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4537; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4538; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4539; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4540; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4541; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4542; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4543;
4544; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
4545; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4546; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4547; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4548; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4549; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4550; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4551; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4552; GFX90A-TGSPLIT-NEXT:    s_endpgm
4553;
4554;
4555    i32* %out, i32 %in, i32 %old) {
4556entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4557  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4558  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
4559  ret void
4560}
4561
; Compare-and-swap through a flat pointer with acq_rel success and monotonic
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4562define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
4563; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4564; GFX7:       ; %bb.0: ; %entry
4565; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4566; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4567; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4568; GFX7-NEXT:    s_add_u32 s0, s0, 16
4569; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4570; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4571; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4572; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4573; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4574; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4575; GFX7-NEXT:    s_endpgm
4576;
4577; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4578; GFX10-WGP:       ; %bb.0: ; %entry
4579; GFX10-WGP-NEXT:    s_clause 0x1
4580; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4581; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4582; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4583; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4584; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4585; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4586; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4587; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4588; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4589; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4590; GFX10-WGP-NEXT:    s_endpgm
4591;
4592; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4593; GFX10-CU:       ; %bb.0: ; %entry
4594; GFX10-CU-NEXT:    s_clause 0x1
4595; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4596; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4597; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4598; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4599; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4600; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4601; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4602; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4603; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4604; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4605; GFX10-CU-NEXT:    s_endpgm
4606;
4607; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4608; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4609; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4610; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4611; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4612; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4613; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4614; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4615; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4616; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4618; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4619; SKIP-CACHE-INV-NEXT:    s_endpgm
4620;
4621; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4622; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4623; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4624; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4625; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4626; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4627; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4628; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4629; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4630;
4631; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
4632; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4633; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4634; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4635; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4636; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4637; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4638; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4639; GFX90A-TGSPLIT-NEXT:    s_endpgm
4640;
4641;
4642    i32* %out, i32 %in, i32 %old) {
4643entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4644  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4645  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
4646  ret void
4647}
4648
; Compare-and-swap through a flat pointer with seq_cst success and monotonic
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4649define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
4650; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4651; GFX7:       ; %bb.0: ; %entry
4652; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4653; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4654; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4655; GFX7-NEXT:    s_add_u32 s0, s0, 16
4656; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4657; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4658; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4659; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4660; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4661; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4662; GFX7-NEXT:    s_endpgm
4663;
4664; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4665; GFX10-WGP:       ; %bb.0: ; %entry
4666; GFX10-WGP-NEXT:    s_clause 0x1
4667; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4668; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4669; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4670; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4671; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4672; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4673; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4674; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4675; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4676; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4677; GFX10-WGP-NEXT:    s_endpgm
4678;
4679; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4680; GFX10-CU:       ; %bb.0: ; %entry
4681; GFX10-CU-NEXT:    s_clause 0x1
4682; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4683; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4684; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4685; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4686; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4687; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4688; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4689; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4690; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4691; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4692; GFX10-CU-NEXT:    s_endpgm
4693;
4694; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4695; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4696; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4697; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4698; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4699; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4700; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4701; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4702; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4703; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4705; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4706; SKIP-CACHE-INV-NEXT:    s_endpgm
4707;
4708; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4709; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4710; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4711; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4712; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4713; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4714; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4715; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4716; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4717;
4718; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
4719; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4720; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4721; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4722; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4723; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4724; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4725; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4726; GFX90A-TGSPLIT-NEXT:    s_endpgm
4727;
4728;
4729    i32* %out, i32 %in, i32 %old) {
4730entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4731  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4732  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
4733  ret void
4734}
4735
; Compare-and-swap through a flat pointer with acquire success and acquire
; failure orderings at syncscope("wavefront-one-as"); the result is unused.
; The target is the i32 at element index 4 from %out, which the autogenerated
; checks below show as a 16-byte displacement (an s_add_u32/s_addc_u32 pair, or
; offset 16 on the gfx90a runs). Every run line expects a flat_atomic_cmpswap
; without glc followed directly by s_endpgm.
4736define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
4737; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4738; GFX7:       ; %bb.0: ; %entry
4739; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4740; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4741; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4742; GFX7-NEXT:    s_add_u32 s0, s0, 16
4743; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4744; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4745; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4746; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4747; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4748; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4749; GFX7-NEXT:    s_endpgm
4750;
4751; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4752; GFX10-WGP:       ; %bb.0: ; %entry
4753; GFX10-WGP-NEXT:    s_clause 0x1
4754; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4755; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4756; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4757; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4758; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4759; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4760; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4761; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4762; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4763; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4764; GFX10-WGP-NEXT:    s_endpgm
4765;
4766; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4767; GFX10-CU:       ; %bb.0: ; %entry
4768; GFX10-CU-NEXT:    s_clause 0x1
4769; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4770; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4771; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4772; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4773; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4774; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4775; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4776; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4777; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4778; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4779; GFX10-CU-NEXT:    s_endpgm
4780;
4781; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4782; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4783; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4784; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4785; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4786; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4787; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4788; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4789; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4790; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4791; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4792; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4793; SKIP-CACHE-INV-NEXT:    s_endpgm
4794;
4795; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4796; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4797; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4798; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4799; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4800; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4801; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4802; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4803; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4804;
4805; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
4806; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4807; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4808; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4809; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4810; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4811; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4812; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4813; GFX90A-TGSPLIT-NEXT:    s_endpgm
4814;
4815;
4816    i32* %out, i32 %in, i32 %old) {
4817entry:
  ; Index 4 of i32 elements, so the atomic targets %out plus 16 bytes.
4818  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; %val is intentionally unused.
4819  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
4820  ret void
4821}
4822
; Non-returning flat cmpxchg at syncscope "wavefront-one-as" with success
; ordering release and failure ordering acquire. The target is element 4 of
; %out (byte offset 16, visible as the s_add_u32 / offset:16 in the expected
; output) and the cmpxchg result is never read. Every configuration expects a
; flat_atomic_cmpswap followed directly by s_endpgm.
4823define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
4824; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4825; GFX7:       ; %bb.0: ; %entry
4826; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4827; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4828; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4829; GFX7-NEXT:    s_add_u32 s0, s0, 16
4830; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4831; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4832; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4833; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4834; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4835; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4836; GFX7-NEXT:    s_endpgm
4837;
4838; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4839; GFX10-WGP:       ; %bb.0: ; %entry
4840; GFX10-WGP-NEXT:    s_clause 0x1
4841; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4842; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4843; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4844; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4845; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4846; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4847; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4848; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4849; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4850; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4851; GFX10-WGP-NEXT:    s_endpgm
4852;
4853; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4854; GFX10-CU:       ; %bb.0: ; %entry
4855; GFX10-CU-NEXT:    s_clause 0x1
4856; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4857; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4858; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4859; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4860; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4861; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4862; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4863; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4864; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4865; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4866; GFX10-CU-NEXT:    s_endpgm
4867;
4868; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4869; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4870; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4871; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4872; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4873; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4874; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4876; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4877; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4878; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4879; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4880; SKIP-CACHE-INV-NEXT:    s_endpgm
4881;
4882; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4883; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4884; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4885; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4886; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4887; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4888; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4889; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4890; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4891;
4892; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
4893; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4894; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4895; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4896; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4897; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4898; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4899; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4900; GFX90A-TGSPLIT-NEXT:    s_endpgm
4901;
4902;
4903    i32* %out, i32 %in, i32 %old) {
4904entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
4905  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; %val is never read afterwards.
4906  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
4907  ret void
4908}
4909
; Non-returning flat cmpxchg at syncscope "wavefront-one-as" with success
; ordering acq_rel and failure ordering acquire. The target is element 4 of
; %out (byte offset 16, visible as the s_add_u32 / offset:16 in the expected
; output) and the cmpxchg result is never read. Every configuration expects a
; flat_atomic_cmpswap followed directly by s_endpgm.
4910define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
4911; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4912; GFX7:       ; %bb.0: ; %entry
4913; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4914; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4915; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4916; GFX7-NEXT:    s_add_u32 s0, s0, 16
4917; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4918; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4919; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4920; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4921; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4922; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4923; GFX7-NEXT:    s_endpgm
4924;
4925; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4926; GFX10-WGP:       ; %bb.0: ; %entry
4927; GFX10-WGP-NEXT:    s_clause 0x1
4928; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4929; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4930; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4931; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4932; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4933; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4934; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4935; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4936; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4937; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4938; GFX10-WGP-NEXT:    s_endpgm
4939;
4940; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4941; GFX10-CU:       ; %bb.0: ; %entry
4942; GFX10-CU-NEXT:    s_clause 0x1
4943; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4944; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4945; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4946; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4947; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4948; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4949; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4950; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4951; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4952; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4953; GFX10-CU-NEXT:    s_endpgm
4954;
4955; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4956; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4957; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4959; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4960; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4961; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4962; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4966; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4967; SKIP-CACHE-INV-NEXT:    s_endpgm
4968;
4969; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4970; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4971; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4972; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4973; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4974; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4975; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4976; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4977; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4978;
4979; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
4980; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4981; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4982; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4983; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4984; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4985; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4986; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4987; GFX90A-TGSPLIT-NEXT:    s_endpgm
4988;
4989;
4990    i32* %out, i32 %in, i32 %old) {
4991entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
4992  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; %val is never read afterwards.
4993  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
4994  ret void
4995}
4996
; Non-returning flat cmpxchg at syncscope "wavefront-one-as" with success
; ordering seq_cst and failure ordering acquire. The target is element 4 of
; %out (byte offset 16, visible as the s_add_u32 / offset:16 in the expected
; output) and the cmpxchg result is never read. Every configuration expects a
; flat_atomic_cmpswap followed directly by s_endpgm.
4997define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
4998; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
4999; GFX7:       ; %bb.0: ; %entry
5000; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5001; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5003; GFX7-NEXT:    s_add_u32 s0, s0, 16
5004; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5005; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5006; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5007; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5008; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5009; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5010; GFX7-NEXT:    s_endpgm
5011;
5012; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
5013; GFX10-WGP:       ; %bb.0: ; %entry
5014; GFX10-WGP-NEXT:    s_clause 0x1
5015; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5016; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5017; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5018; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5019; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5020; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5021; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5022; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5023; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5024; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5025; GFX10-WGP-NEXT:    s_endpgm
5026;
5027; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
5028; GFX10-CU:       ; %bb.0: ; %entry
5029; GFX10-CU-NEXT:    s_clause 0x1
5030; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5031; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5032; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5033; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5034; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5035; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5036; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5037; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5038; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5039; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5040; GFX10-CU-NEXT:    s_endpgm
5041;
5042; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
5043; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5044; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5045; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5046; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5047; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5048; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5050; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5051; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5053; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5054; SKIP-CACHE-INV-NEXT:    s_endpgm
5055;
5056; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
5057; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5058; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5059; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5060; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5061; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5062; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5063; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5064; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5065;
5066; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
5067; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5068; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5069; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5070; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5071; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5072; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5073; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5074; GFX90A-TGSPLIT-NEXT:    s_endpgm
5075;
5076;
5077    i32* %out, i32 %in, i32 %old) {
5078entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
5079  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; %val is never read afterwards.
5080  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
5081  ret void
5082}
5083
; Non-returning flat cmpxchg at syncscope "wavefront-one-as" with seq_cst for
; both the success and the failure ordering. The target is element 4 of %out
; (byte offset 16, visible as the s_add_u32 / offset:16 in the expected
; output) and the cmpxchg result is never read. Every configuration expects a
; flat_atomic_cmpswap followed directly by s_endpgm.
5084define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
5085; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5086; GFX7:       ; %bb.0: ; %entry
5087; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5088; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5089; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5090; GFX7-NEXT:    s_add_u32 s0, s0, 16
5091; GFX7-NEXT:    s_addc_u32 s1, s1, 0
5092; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5093; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5094; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5095; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5096; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5097; GFX7-NEXT:    s_endpgm
5098;
5099; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5100; GFX10-WGP:       ; %bb.0: ; %entry
5101; GFX10-WGP-NEXT:    s_clause 0x1
5102; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5103; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5104; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5105; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
5106; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
5107; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5108; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5109; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5110; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5111; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5112; GFX10-WGP-NEXT:    s_endpgm
5113;
5114; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5115; GFX10-CU:       ; %bb.0: ; %entry
5116; GFX10-CU-NEXT:    s_clause 0x1
5117; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5118; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5119; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5120; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
5121; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
5122; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5123; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5124; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5125; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5126; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5127; GFX10-CU-NEXT:    s_endpgm
5128;
5129; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5130; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5131; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5132; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5133; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5134; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
5135; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
5136; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5137; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5138; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5139; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5140; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5141; SKIP-CACHE-INV-NEXT:    s_endpgm
5142;
5143; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5144; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5145; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5146; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5147; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5148; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5149; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5150; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5151; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5152;
5153; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
5154; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5155; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5156; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5157; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5158; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5159; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5160; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5161; GFX90A-TGSPLIT-NEXT:    s_endpgm
5162;
5163;
5164    i32* %out, i32 %in, i32 %old) {
5165entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
5166  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; %val is never read afterwards.
5167  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
5168  ret void
5169}
5170
; Returning flat cmpxchg at syncscope "wavefront-one-as" with success ordering
; acquire and failure ordering monotonic. The atomic targets element 4 of %out
; (byte offset 16) and field 0 of its result is stored back to %out itself.
; The expected output has the cmpswap carrying glc and an s_waitcnt before the
; flat_store_dword; the +tgsplit configuration expects vmcnt(0) only, while
; all other configurations expect vmcnt(0) lgkmcnt(0).
5171define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
5172; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5173; GFX7:       ; %bb.0: ; %entry
5174; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5175; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5176; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5177; GFX7-NEXT:    s_add_u32 s4, s0, 16
5178; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5179; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5180; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5181; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5182; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5183; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5184; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5185; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5186; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5187; GFX7-NEXT:    flat_store_dword v[0:1], v2
5188; GFX7-NEXT:    s_endpgm
5189;
5190; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5191; GFX10-WGP:       ; %bb.0: ; %entry
5192; GFX10-WGP-NEXT:    s_clause 0x1
5193; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5194; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5195; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5196; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5197; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5198; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5199; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5200; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5201; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5202; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5203; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5204; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5205; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5206; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5207; GFX10-WGP-NEXT:    s_endpgm
5208;
5209; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5210; GFX10-CU:       ; %bb.0: ; %entry
5211; GFX10-CU-NEXT:    s_clause 0x1
5212; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5213; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5214; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5215; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5216; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5217; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5218; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5219; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5220; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5221; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5222; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5223; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5224; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5225; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5226; GFX10-CU-NEXT:    s_endpgm
5227;
5228; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5229; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5230; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5231; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5232; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5233; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5234; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5236; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5237; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5238; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5239; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5240; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5241; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5242; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5243; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5244; SKIP-CACHE-INV-NEXT:    s_endpgm
5245;
5246; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5247; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5248; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5249; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5250; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5251; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5252; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5253; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5254; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5255; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5256; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5257;
5258; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
5259; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5260; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5261; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5262; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5263; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5264; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5265; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5266; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5267; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5268; GFX90A-TGSPLIT-NEXT:    s_endpgm
5269;
5270;
5271    i32* %out, i32 %in, i32 %old) {
5272entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
5273  %gep = getelementptr i32, i32* %out, i32 4
5274  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the value loaded by the cmpxchg; the
  ; i1 success flag is not used.
5275  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store goes to %out itself, not to the %gep address.
5276  store i32 %val0, i32* %out, align 4
5277  ret void
5278}
5279
; Returning flat cmpxchg at syncscope "wavefront-one-as" with success ordering
; acq_rel and failure ordering monotonic. The atomic targets element 4 of %out
; (byte offset 16) and field 0 of its result is stored back to %out itself.
; The expected output has the cmpswap carrying glc and an s_waitcnt before the
; flat_store_dword; the +tgsplit configuration expects vmcnt(0) only, while
; all other configurations expect vmcnt(0) lgkmcnt(0).
5280define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
5281; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5282; GFX7:       ; %bb.0: ; %entry
5283; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5284; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5285; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5286; GFX7-NEXT:    s_add_u32 s4, s0, 16
5287; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5288; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5289; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5290; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5291; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5292; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5293; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5294; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5295; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5296; GFX7-NEXT:    flat_store_dword v[0:1], v2
5297; GFX7-NEXT:    s_endpgm
5298;
5299; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5300; GFX10-WGP:       ; %bb.0: ; %entry
5301; GFX10-WGP-NEXT:    s_clause 0x1
5302; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5303; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5304; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5305; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5306; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5307; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5308; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5310; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5311; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5312; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5313; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5314; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5315; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5316; GFX10-WGP-NEXT:    s_endpgm
5317;
5318; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5319; GFX10-CU:       ; %bb.0: ; %entry
5320; GFX10-CU-NEXT:    s_clause 0x1
5321; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5322; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5323; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5324; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5325; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5326; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5327; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5328; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5329; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5330; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5331; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5332; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5333; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5334; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5335; GFX10-CU-NEXT:    s_endpgm
5336;
5337; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5338; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5339; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5340; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5341; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5342; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5343; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5344; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5345; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5347; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5348; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5351; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5352; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5353; SKIP-CACHE-INV-NEXT:    s_endpgm
5354;
5355; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5356; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5357; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5358; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5359; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5360; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5361; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5362; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5363; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5364; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5365; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5366;
5367; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
5368; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5369; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5370; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5371; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5372; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5373; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5374; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5375; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5376; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5377; GFX90A-TGSPLIT-NEXT:    s_endpgm
5378;
5379;
5380    i32* %out, i32 %in, i32 %old) {
5381entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
5382  %gep = getelementptr i32, i32* %out, i32 4
5383  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value loaded by the cmpxchg; the
  ; i1 success flag is not used.
5384  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store goes to %out itself, not to the %gep address.
5385  store i32 %val0, i32* %out, align 4
5386  ret void
5387}
5388
; Returning flat cmpxchg at syncscope "wavefront-one-as" with success ordering
; seq_cst and failure ordering monotonic. The atomic targets element 4 of %out
; (byte offset 16) and field 0 of its result is stored back to %out itself.
; The expected output has the cmpswap carrying glc and an s_waitcnt before the
; flat_store_dword; the +tgsplit configuration expects vmcnt(0) only, while
; all other configurations expect vmcnt(0) lgkmcnt(0).
5389define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
5390; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5391; GFX7:       ; %bb.0: ; %entry
5392; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5393; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5394; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5395; GFX7-NEXT:    s_add_u32 s4, s0, 16
5396; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5397; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5398; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5399; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5400; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5401; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5402; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5403; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5404; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5405; GFX7-NEXT:    flat_store_dword v[0:1], v2
5406; GFX7-NEXT:    s_endpgm
5407;
5408; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5409; GFX10-WGP:       ; %bb.0: ; %entry
5410; GFX10-WGP-NEXT:    s_clause 0x1
5411; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5412; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5413; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5414; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5415; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5417; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5418; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5419; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5420; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5421; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5422; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5423; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5424; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5425; GFX10-WGP-NEXT:    s_endpgm
5426;
5427; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5428; GFX10-CU:       ; %bb.0: ; %entry
5429; GFX10-CU-NEXT:    s_clause 0x1
5430; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5431; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5432; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5433; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5434; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5435; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5436; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5437; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5438; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5439; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5440; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5441; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5442; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5443; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5444; GFX10-CU-NEXT:    s_endpgm
5445;
5446; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5447; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5448; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5449; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5450; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5451; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5452; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5454; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5455; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5456; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5457; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5458; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5459; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5460; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5461; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5462; SKIP-CACHE-INV-NEXT:    s_endpgm
5463;
5464; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5465; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5466; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5467; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5468; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5469; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5470; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5471; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5472; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5473; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5474; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5475;
5476; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
5477; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5478; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5479; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5480; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5481; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5482; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
5483; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5484; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5485; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5486; GFX90A-TGSPLIT-NEXT:    s_endpgm
5487;
5488;
5489    i32* %out, i32 %in, i32 %old) {
5490entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
5491  %gep = getelementptr i32, i32* %out, i32 4
5492  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value loaded by the cmpxchg; the
  ; i1 success flag is not used.
5493  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store goes to %out itself, not to the %gep address.
5494  store i32 %val0, i32* %out, align 4
5495  ret void
5496}
5497
; Kernel under test: a value-returning flat cmpxchg at syncscope
; "wavefront-one-as" with success ordering `acquire` and failure ordering
; `acquire`.
;
; The IR performs a volatile cmpxchg on the i32 four elements past %out
; (expected value %old, replacement %in) and then stores the value the
; cmpxchg loaded back to %out.
;
; The per-target comment blocks between the `define` line and the parameter
; list are FileCheck assertions autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; In every configuration the expected body is one `flat_atomic_cmpswap ... glc`,
; one `s_waitcnt`, and the `flat_store_dword`. The gfx90a +tgsplit expectation
; waits on vmcnt(0) only, while the others wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of the atomic: element 4 of %out, i.e. 16 bytes past the base.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match; the result is { loaded value, success flag }.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
  ; Keep only the loaded value; the success flag is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returning form of the atomic is required.
  store i32 %val0, i32* %out, align 4
  ret void
}
5606
; Kernel under test: a value-returning flat cmpxchg at syncscope
; "wavefront-one-as" with success ordering `release` and failure ordering
; `acquire`.
;
; The IR performs a volatile cmpxchg on the i32 four elements past %out
; (expected value %old, replacement %in) and then stores the value the
; cmpxchg loaded back to %out.
;
; The per-target comment blocks between the `define` line and the parameter
; list are FileCheck assertions autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; In every configuration the expected body is one `flat_atomic_cmpswap ... glc`,
; one `s_waitcnt`, and the `flat_store_dword`. The gfx90a +tgsplit expectation
; waits on vmcnt(0) only, while the others wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of the atomic: element 4 of %out, i.e. 16 bytes past the base.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match; the result is { loaded value, success flag }.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
  ; Keep only the loaded value; the success flag is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returning form of the atomic is required.
  store i32 %val0, i32* %out, align 4
  ret void
}
5715
; Kernel under test: a value-returning flat cmpxchg at syncscope
; "wavefront-one-as" with success ordering `acq_rel` and failure ordering
; `acquire`.
;
; The IR performs a volatile cmpxchg on the i32 four elements past %out
; (expected value %old, replacement %in) and then stores the value the
; cmpxchg loaded back to %out.
;
; The per-target comment blocks between the `define` line and the parameter
; list are FileCheck assertions autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; In every configuration the expected body is one `flat_atomic_cmpswap ... glc`,
; one `s_waitcnt`, and the `flat_store_dword`. The gfx90a +tgsplit expectation
; waits on vmcnt(0) only, while the others wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of the atomic: element 4 of %out, i.e. 16 bytes past the base.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match; the result is { loaded value, success flag }.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
  ; Keep only the loaded value; the success flag is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returning form of the atomic is required.
  store i32 %val0, i32* %out, align 4
  ret void
}
5824
; Kernel under test: a value-returning flat cmpxchg at syncscope
; "wavefront-one-as" with success ordering `seq_cst` and failure ordering
; `acquire`.
;
; The IR performs a volatile cmpxchg on the i32 four elements past %out
; (expected value %old, replacement %in) and then stores the value the
; cmpxchg loaded back to %out.
;
; The per-target comment blocks between the `define` line and the parameter
; list are FileCheck assertions autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; In every configuration the expected body is one `flat_atomic_cmpswap ... glc`,
; one `s_waitcnt`, and the `flat_store_dword`. The gfx90a +tgsplit expectation
; waits on vmcnt(0) only, while the others wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of the atomic: element 4 of %out, i.e. 16 bytes past the base.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match; the result is { loaded value, success flag }.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
  ; Keep only the loaded value; the success flag is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returning form of the atomic is required.
  store i32 %val0, i32* %out, align 4
  ret void
}
5933
; Kernel under test: a value-returning flat cmpxchg at syncscope
; "wavefront-one-as" with success ordering `seq_cst` and failure ordering
; `seq_cst`.
;
; The IR performs a volatile cmpxchg on the i32 four elements past %out
; (expected value %old, replacement %in) and then stores the value the
; cmpxchg loaded back to %out.
;
; The per-target comment blocks between the `define` line and the parameter
; list are FileCheck assertions autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; In every configuration the expected body is one `flat_atomic_cmpswap ... glc`,
; one `s_waitcnt`, and the `flat_store_dword`. The gfx90a +tgsplit expectation
; waits on vmcnt(0) only, while the others wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Address of the atomic: element 4 of %out, i.e. 16 bytes past the base.
  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and write %in on a match; the result is { loaded value, success flag }.
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
  ; Keep only the loaded value; the success flag is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returning form of the atomic is required.
  store i32 %val0, i32* %out, align 4
  ret void
}
6042
6043