1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8
; Test kernel: atomically loads an i32 from %in with `unordered` ordering (no
; syncscope is written on the instruction) and copies the value to %out with
; an ordinary, non-atomic store. Both pointers are plain `i32*` kernel
; arguments.
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: a bare flat_load_dword with no cache-policy
;     modifier and no cache-invalidate instruction anywhere in the kernel;
;   * after the load, the only s_waitcnt is the one placed immediately before
;     flat_store_dword;
;   * the gfx90a +tgsplit run expects that wait as `vmcnt(0)` only, all other
;     runs expect `vmcnt(0) lgkmcnt(0)`;
;   * the amdpal run reads its kernel arguments through s[0:1] at offset 0x9
;     instead of s[4:5] at offset 0x0.
9define amdgpu_kernel void @flat_system_unordered_load(
10; GFX7-LABEL: flat_system_unordered_load:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
13; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX7-NEXT:    v_mov_b32_e32 v0, s0
15; GFX7-NEXT:    v_mov_b32_e32 v1, s1
16; GFX7-NEXT:    flat_load_dword v0, v[0:1]
17; GFX7-NEXT:    v_mov_b32_e32 v2, s2
18; GFX7-NEXT:    v_mov_b32_e32 v3, s3
19; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20; GFX7-NEXT:    flat_store_dword v[2:3], v0
21; GFX7-NEXT:    s_endpgm
22;
23; GFX10-WGP-LABEL: flat_system_unordered_load:
24; GFX10-WGP:       ; %bb.0: ; %entry
25; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
26; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
28; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
29; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
30; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
31; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
32; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
33; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
34; GFX10-WGP-NEXT:    s_endpgm
35;
36; GFX10-CU-LABEL: flat_system_unordered_load:
37; GFX10-CU:       ; %bb.0: ; %entry
38; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
39; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
43; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
44; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
45; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
46; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
47; GFX10-CU-NEXT:    s_endpgm
48;
49; SKIP-CACHE-INV-LABEL: flat_system_unordered_load:
50; SKIP-CACHE-INV:       ; %bb.0: ; %entry
51; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
52; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
53; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
54; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
55; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
56; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
57; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
58; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
59; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
60; SKIP-CACHE-INV-NEXT:    s_endpgm
61;
62; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
63; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
64; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
65; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
67; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
68; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
69; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
70; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
71; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
72; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
73; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
74;
75; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
76; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
77; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
78; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
80; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
81; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
82; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
83; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
84; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
85; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
86; GFX90A-TGSPLIT-NEXT:    s_endpgm
87    i32* %in, i32* %out) {
88entry:
89  %val = load atomic i32, i32* %in unordered, align 4
90  store i32 %val, i32* %out
91  ret void
92}
93
; Test kernel: atomically loads an i32 from %in with `monotonic` ordering (no
; syncscope is written on the instruction) and copies the value to %out with
; an ordinary, non-atomic store.
;
; What the autogenerated checks below expect, per run configuration:
;   * gfx700, the amdpal run and both gfx90a runs: flat_load_dword carries the
;     `glc` modifier;
;   * both gfx1010 runs (default and +cumode): flat_load_dword carries
;     `glc dlc`;
;   * no configuration expects a cache-invalidate instruction;
;   * after the load, the only s_waitcnt is the one placed immediately before
;     flat_store_dword; the gfx90a +tgsplit run expects it as `vmcnt(0)` only,
;     all other runs expect `vmcnt(0) lgkmcnt(0)`.
94define amdgpu_kernel void @flat_system_monotonic_load(
95; GFX7-LABEL: flat_system_monotonic_load:
96; GFX7:       ; %bb.0: ; %entry
97; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
98; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX7-NEXT:    v_mov_b32_e32 v0, s0
100; GFX7-NEXT:    v_mov_b32_e32 v1, s1
101; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
102; GFX7-NEXT:    v_mov_b32_e32 v2, s2
103; GFX7-NEXT:    v_mov_b32_e32 v3, s3
104; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
105; GFX7-NEXT:    flat_store_dword v[2:3], v0
106; GFX7-NEXT:    s_endpgm
107;
108; GFX10-WGP-LABEL: flat_system_monotonic_load:
109; GFX10-WGP:       ; %bb.0: ; %entry
110; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
111; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
113; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
114; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
115; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
116; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
117; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
118; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
119; GFX10-WGP-NEXT:    s_endpgm
120;
121; GFX10-CU-LABEL: flat_system_monotonic_load:
122; GFX10-CU:       ; %bb.0: ; %entry
123; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
124; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
126; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
127; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
128; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
129; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
130; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
132; GFX10-CU-NEXT:    s_endpgm
133;
134; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load:
135; SKIP-CACHE-INV:       ; %bb.0: ; %entry
136; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
137; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
138; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
139; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
140; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
143; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
144; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
145; SKIP-CACHE-INV-NEXT:    s_endpgm
146;
147; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
148; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
149; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
150; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
152; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
153; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
154; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
155; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
156; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
157; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
158; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
159;
160; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
161; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
162; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
163; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
165; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
166; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
167; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
168; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
169; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
170; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
171; GFX90A-TGSPLIT-NEXT:    s_endpgm
172    i32* %in, i32* %out) {
173entry:
174  %val = load atomic i32, i32* %in monotonic, align 4
175  store i32 %val, i32* %out
176  ret void
177}
178
; Test kernel: atomically loads an i32 from %in with `acquire` ordering (no
; syncscope is written on the instruction) and copies the value to %out with
; an ordinary, non-atomic store.
;
; What the autogenerated checks below expect, per run configuration:
;   * the load carries `glc` on gfx700, the amdpal run and gfx90a, and
;     `glc dlc` on both gfx1010 runs;
;   * an s_waitcnt sits immediately after the load in every configuration
;     (`vmcnt(0)` only for gfx90a +tgsplit, `vmcnt(0) lgkmcnt(0)` elsewhere);
;   * that wait is followed by cache invalidation:
;       - gfx700: buffer_wbinvl1_vol
;       - gfx1010, default and +cumode: buffer_gl0_inv then buffer_gl1_inv
;       - gfx90a, with and without tgsplit: buffer_invl2 then
;         buffer_wbinvl1_vol
;   * the run that passes -amdgcn-skip-cache-invalidations keeps the wait but
;     expects no invalidate instruction before the store.
179define amdgpu_kernel void @flat_system_acquire_load(
180; GFX7-LABEL: flat_system_acquire_load:
181; GFX7:       ; %bb.0: ; %entry
182; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
183; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX7-NEXT:    v_mov_b32_e32 v0, s0
185; GFX7-NEXT:    v_mov_b32_e32 v1, s1
186; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
187; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
188; GFX7-NEXT:    buffer_wbinvl1_vol
189; GFX7-NEXT:    v_mov_b32_e32 v2, s2
190; GFX7-NEXT:    v_mov_b32_e32 v3, s3
191; GFX7-NEXT:    flat_store_dword v[2:3], v0
192; GFX7-NEXT:    s_endpgm
193;
194; GFX10-WGP-LABEL: flat_system_acquire_load:
195; GFX10-WGP:       ; %bb.0: ; %entry
196; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
197; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
199; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
200; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
201; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
202; GFX10-WGP-NEXT:    buffer_gl0_inv
203; GFX10-WGP-NEXT:    buffer_gl1_inv
204; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
205; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
206; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
207; GFX10-WGP-NEXT:    s_endpgm
208;
209; GFX10-CU-LABEL: flat_system_acquire_load:
210; GFX10-CU:       ; %bb.0: ; %entry
211; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
212; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
214; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
215; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
216; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
217; GFX10-CU-NEXT:    buffer_gl0_inv
218; GFX10-CU-NEXT:    buffer_gl1_inv
219; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
220; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
221; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
222; GFX10-CU-NEXT:    s_endpgm
223;
224; SKIP-CACHE-INV-LABEL: flat_system_acquire_load:
225; SKIP-CACHE-INV:       ; %bb.0: ; %entry
226; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
227; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
228; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
229; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
230; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
231; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
234; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
235; SKIP-CACHE-INV-NEXT:    s_endpgm
236;
237; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
238; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
239; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
240; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
242; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
243; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
244; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
245; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
246; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
247; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
248; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
249; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
250; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
251;
252; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
253; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
254; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
258; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
259; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
260; GFX90A-TGSPLIT-NEXT:    buffer_invl2
261; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
262; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
263; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
264; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
265; GFX90A-TGSPLIT-NEXT:    s_endpgm
266    i32* %in, i32* %out) {
267entry:
268  %val = load atomic i32, i32* %in acquire, align 4
269  store i32 %val, i32* %out
270  ret void
271}
272
; Test kernel: atomically loads an i32 from %in with `seq_cst` ordering (no
; syncscope is written on the instruction) and copies the value to %out with
; an ordinary, non-atomic store.
;
; What the autogenerated checks below expect, per run configuration:
;   * a wait BEFORE the load in every configuration:
;     `s_waitcnt vmcnt(0) lgkmcnt(0)`, and on both gfx1010 runs an additional
;     `s_waitcnt_vscnt null, 0x0`;
;   * the load carries `glc` on gfx700, the amdpal run and gfx90a, and
;     `glc dlc` on both gfx1010 runs;
;   * a wait AFTER the load (`vmcnt(0)` only for gfx90a +tgsplit,
;     `vmcnt(0) lgkmcnt(0)` elsewhere), followed by cache invalidation:
;       - gfx700: buffer_wbinvl1_vol
;       - gfx1010, default and +cumode: buffer_gl0_inv then buffer_gl1_inv
;       - gfx90a, with and without tgsplit: buffer_invl2 then
;         buffer_wbinvl1_vol
;   * the run that passes -amdgcn-skip-cache-invalidations keeps both waits
;     but expects no invalidate instruction.
273define amdgpu_kernel void @flat_system_seq_cst_load(
274; GFX7-LABEL: flat_system_seq_cst_load:
275; GFX7:       ; %bb.0: ; %entry
276; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
277; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX7-NEXT:    v_mov_b32_e32 v0, s0
279; GFX7-NEXT:    v_mov_b32_e32 v1, s1
280; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
281; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
282; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
283; GFX7-NEXT:    buffer_wbinvl1_vol
284; GFX7-NEXT:    v_mov_b32_e32 v2, s2
285; GFX7-NEXT:    v_mov_b32_e32 v3, s3
286; GFX7-NEXT:    flat_store_dword v[2:3], v0
287; GFX7-NEXT:    s_endpgm
288;
289; GFX10-WGP-LABEL: flat_system_seq_cst_load:
290; GFX10-WGP:       ; %bb.0: ; %entry
291; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
292; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
294; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
295; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
296; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
297; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
298; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
299; GFX10-WGP-NEXT:    buffer_gl0_inv
300; GFX10-WGP-NEXT:    buffer_gl1_inv
301; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
302; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
303; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
304; GFX10-WGP-NEXT:    s_endpgm
305;
306; GFX10-CU-LABEL: flat_system_seq_cst_load:
307; GFX10-CU:       ; %bb.0: ; %entry
308; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
309; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
311; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
312; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
313; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
314; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
315; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
316; GFX10-CU-NEXT:    buffer_gl0_inv
317; GFX10-CU-NEXT:    buffer_gl1_inv
318; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
319; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
320; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
321; GFX10-CU-NEXT:    s_endpgm
322;
323; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load:
324; SKIP-CACHE-INV:       ; %bb.0: ; %entry
325; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
326; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
329; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
330; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
331; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
333; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
334; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
335; SKIP-CACHE-INV-NEXT:    s_endpgm
336;
337; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
338; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
340; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
342; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
343; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
344; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
345; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
346; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
347; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
348; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
349; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
350; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
351; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
352;
353; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
354; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
355; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
356; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
358; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
359; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
360; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
361; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
362; GFX90A-TGSPLIT-NEXT:    buffer_invl2
363; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
364; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
365; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
366; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
367; GFX90A-TGSPLIT-NEXT:    s_endpgm
368    i32* %in, i32* %out) {
369entry:
370  %val = load atomic i32, i32* %in seq_cst, align 4
371  store i32 %val, i32* %out
372  ret void
373}
374
; Test kernel: atomically stores the i32 kernel argument %in to %out with
; `unordered` ordering (no syncscope is written on the instruction).
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: a bare flat_store_dword with no cache-policy
;     modifier;
;   * no s_waitcnt other than the `lgkmcnt(0)` that follows the kernel-argument
;     loads, and no cache write-back or invalidate instruction;
;   * both gfx1010 runs group the two argument loads under `s_clause 0x1`;
;   * both gfx90a runs build the address pair with a single v_pk_mov_b32
;     instead of two v_mov_b32_e32.
375define amdgpu_kernel void @flat_system_unordered_store(
376; GFX7-LABEL: flat_system_unordered_store:
377; GFX7:       ; %bb.0: ; %entry
378; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
379; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
380; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX7-NEXT:    v_mov_b32_e32 v2, s2
382; GFX7-NEXT:    v_mov_b32_e32 v0, s0
383; GFX7-NEXT:    v_mov_b32_e32 v1, s1
384; GFX7-NEXT:    flat_store_dword v[0:1], v2
385; GFX7-NEXT:    s_endpgm
386;
387; GFX10-WGP-LABEL: flat_system_unordered_store:
388; GFX10-WGP:       ; %bb.0: ; %entry
389; GFX10-WGP-NEXT:    s_clause 0x1
390; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
391; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
392; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
394; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
395; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
396; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
397; GFX10-WGP-NEXT:    s_endpgm
398;
399; GFX10-CU-LABEL: flat_system_unordered_store:
400; GFX10-CU:       ; %bb.0: ; %entry
401; GFX10-CU-NEXT:    s_clause 0x1
402; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
403; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
404; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
406; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
407; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
408; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
409; GFX10-CU-NEXT:    s_endpgm
410;
411; SKIP-CACHE-INV-LABEL: flat_system_unordered_store:
412; SKIP-CACHE-INV:       ; %bb.0: ; %entry
413; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
414; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
415; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
419; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
420; SKIP-CACHE-INV-NEXT:    s_endpgm
421;
422; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
423; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
424; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
425; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
426; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
428; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
429; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
430; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
431;
432; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
433; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
434; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
435; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
436; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
438; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
439; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
440; GFX90A-TGSPLIT-NEXT:    s_endpgm
441    i32 %in, i32* %out) {
442entry:
443  store atomic i32 %in, i32* %out unordered, align 4
444  ret void
445}
446
; Test kernel: atomically stores the i32 kernel argument %in to %out with
; `monotonic` ordering (no syncscope is written on the instruction).
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: a bare flat_store_dword with no cache-policy
;     modifier;
;   * no s_waitcnt other than the `lgkmcnt(0)` that follows the kernel-argument
;     loads, and no cache write-back or invalidate instruction;
;   * both gfx1010 runs group the two argument loads under `s_clause 0x1`;
;   * both gfx90a runs build the address pair with a single v_pk_mov_b32
;     instead of two v_mov_b32_e32.
447define amdgpu_kernel void @flat_system_monotonic_store(
448; GFX7-LABEL: flat_system_monotonic_store:
449; GFX7:       ; %bb.0: ; %entry
450; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
451; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
452; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
453; GFX7-NEXT:    v_mov_b32_e32 v2, s2
454; GFX7-NEXT:    v_mov_b32_e32 v0, s0
455; GFX7-NEXT:    v_mov_b32_e32 v1, s1
456; GFX7-NEXT:    flat_store_dword v[0:1], v2
457; GFX7-NEXT:    s_endpgm
458;
459; GFX10-WGP-LABEL: flat_system_monotonic_store:
460; GFX10-WGP:       ; %bb.0: ; %entry
461; GFX10-WGP-NEXT:    s_clause 0x1
462; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
463; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
464; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
466; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
467; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
468; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
469; GFX10-WGP-NEXT:    s_endpgm
470;
471; GFX10-CU-LABEL: flat_system_monotonic_store:
472; GFX10-CU:       ; %bb.0: ; %entry
473; GFX10-CU-NEXT:    s_clause 0x1
474; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
475; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
476; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
478; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
479; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
480; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
481; GFX10-CU-NEXT:    s_endpgm
482;
483; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store:
484; SKIP-CACHE-INV:       ; %bb.0: ; %entry
485; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
486; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
487; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
491; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
492; SKIP-CACHE-INV-NEXT:    s_endpgm
493;
494; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
495; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
496; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
498; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
500; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
501; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
503;
504; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
506; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
507; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
508; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
510; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
511; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
512; GFX90A-TGSPLIT-NEXT:    s_endpgm
513    i32 %in, i32* %out) {
514entry:
515  store atomic i32 %in, i32* %out monotonic, align 4
516  ret void
517}
518
; Test kernel: atomically stores the i32 kernel argument %in to %out with
; `release` ordering (no syncscope is written on the instruction).
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: `s_waitcnt vmcnt(0) lgkmcnt(0)` immediately before
;     a flat_store_dword that has no cache-policy modifier;
;   * both gfx1010 runs additionally expect `s_waitcnt_vscnt null, 0x0`
;     between that wait and the store;
;   * both gfx90a runs (with and without tgsplit) expect buffer_wbl2 ahead of
;     the wait;
;   * no configuration expects a cache-invalidate instruction, so the amdpal
;     run with -amdgcn-skip-cache-invalidations shows the same instruction
;     sequence as gfx700 apart from kernel-argument addressing.
519define amdgpu_kernel void @flat_system_release_store(
520; GFX7-LABEL: flat_system_release_store:
521; GFX7:       ; %bb.0: ; %entry
522; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
523; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
524; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX7-NEXT:    v_mov_b32_e32 v2, s2
526; GFX7-NEXT:    v_mov_b32_e32 v0, s0
527; GFX7-NEXT:    v_mov_b32_e32 v1, s1
528; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
529; GFX7-NEXT:    flat_store_dword v[0:1], v2
530; GFX7-NEXT:    s_endpgm
531;
532; GFX10-WGP-LABEL: flat_system_release_store:
533; GFX10-WGP:       ; %bb.0: ; %entry
534; GFX10-WGP-NEXT:    s_clause 0x1
535; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
536; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
537; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
539; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
540; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
541; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
542; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
543; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
544; GFX10-WGP-NEXT:    s_endpgm
545;
546; GFX10-CU-LABEL: flat_system_release_store:
547; GFX10-CU:       ; %bb.0: ; %entry
548; GFX10-CU-NEXT:    s_clause 0x1
549; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
550; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
551; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
553; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
554; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
555; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
556; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
557; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
558; GFX10-CU-NEXT:    s_endpgm
559;
560; SKIP-CACHE-INV-LABEL: flat_system_release_store:
561; SKIP-CACHE-INV:       ; %bb.0: ; %entry
562; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
563; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
564; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
566; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
568; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
569; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
570; SKIP-CACHE-INV-NEXT:    s_endpgm
571;
572; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
573; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
574; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
575; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
576; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
578; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
579; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
581; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
582; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
583;
584; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
585; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
586; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
587; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
588; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
589; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
590; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
591; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
592; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
593; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
594; GFX90A-TGSPLIT-NEXT:    s_endpgm
595    i32 %in, i32* %out) {
596entry:
597  store atomic i32 %in, i32* %out release, align 4
598  ret void
599}
600
; Test kernel: atomically stores the i32 kernel argument %in to %out with
; `seq_cst` ordering (no syncscope is written on the instruction).
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: `s_waitcnt vmcnt(0) lgkmcnt(0)` immediately before
;     a flat_store_dword that has no cache-policy modifier;
;   * both gfx1010 runs additionally expect `s_waitcnt_vscnt null, 0x0`
;     between that wait and the store;
;   * both gfx90a runs (with and without tgsplit) expect buffer_wbl2 ahead of
;     the wait;
;   * nothing is expected after the store except s_endpgm, and no
;     configuration expects a cache-invalidate instruction.
601define amdgpu_kernel void @flat_system_seq_cst_store(
602; GFX7-LABEL: flat_system_seq_cst_store:
603; GFX7:       ; %bb.0: ; %entry
604; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
605; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
606; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX7-NEXT:    v_mov_b32_e32 v2, s2
608; GFX7-NEXT:    v_mov_b32_e32 v0, s0
609; GFX7-NEXT:    v_mov_b32_e32 v1, s1
610; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
611; GFX7-NEXT:    flat_store_dword v[0:1], v2
612; GFX7-NEXT:    s_endpgm
613;
614; GFX10-WGP-LABEL: flat_system_seq_cst_store:
615; GFX10-WGP:       ; %bb.0: ; %entry
616; GFX10-WGP-NEXT:    s_clause 0x1
617; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
618; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
619; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
620; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
621; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
622; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
623; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
624; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
625; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
626; GFX10-WGP-NEXT:    s_endpgm
627;
628; GFX10-CU-LABEL: flat_system_seq_cst_store:
629; GFX10-CU:       ; %bb.0: ; %entry
630; GFX10-CU-NEXT:    s_clause 0x1
631; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
632; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
633; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
635; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
636; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
637; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
638; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
639; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
640; GFX10-CU-NEXT:    s_endpgm
641;
642; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store:
643; SKIP-CACHE-INV:       ; %bb.0: ; %entry
644; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
645; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
646; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
649; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
650; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
651; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
652; SKIP-CACHE-INV-NEXT:    s_endpgm
653;
654; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
655; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
657; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
658; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
660; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
661; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
662; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
663; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
664; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
665;
666; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
667; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
668; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
669; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
670; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
672; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
673; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
674; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
675; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
676; GFX90A-TGSPLIT-NEXT:    s_endpgm
677    i32 %in, i32* %out) {
678entry:
679  store atomic i32 %in, i32* %out seq_cst, align 4
680  ret void
681}
682
; Test kernel: performs a volatile `atomicrmw xchg` of the i32 kernel argument
; %in into %out with `monotonic` ordering (no syncscope is written on the
; instruction). The returned old value %val is never used.
;
; What the autogenerated checks below expect, per run configuration:
;   * every configuration: a flat_atomic_swap that takes only the address
;     pair and the data register, with no cache-policy modifier;
;   * no s_waitcnt other than the `lgkmcnt(0)` that follows the kernel-argument
;     loads, and no cache write-back or invalidate instruction;
;   * both gfx1010 runs group the two argument loads under `s_clause 0x1`;
;   * both gfx90a runs build the address pair with a single v_pk_mov_b32
;     instead of two v_mov_b32_e32.
683define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
684; GFX7-LABEL: flat_system_monotonic_atomicrmw:
685; GFX7:       ; %bb.0: ; %entry
686; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
687; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
688; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX7-NEXT:    v_mov_b32_e32 v0, s0
690; GFX7-NEXT:    v_mov_b32_e32 v1, s1
691; GFX7-NEXT:    v_mov_b32_e32 v2, s2
692; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
693; GFX7-NEXT:    s_endpgm
694;
695; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
696; GFX10-WGP:       ; %bb.0: ; %entry
697; GFX10-WGP-NEXT:    s_clause 0x1
698; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
699; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
700; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
702; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
703; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
704; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
705; GFX10-WGP-NEXT:    s_endpgm
706;
707; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
708; GFX10-CU:       ; %bb.0: ; %entry
709; GFX10-CU-NEXT:    s_clause 0x1
710; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
711; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
712; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
714; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
715; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
716; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
717; GFX10-CU-NEXT:    s_endpgm
718;
719; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw:
720; SKIP-CACHE-INV:       ; %bb.0: ; %entry
721; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
722; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
723; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
724; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
725; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
726; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
727; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
728; SKIP-CACHE-INV-NEXT:    s_endpgm
729;
730; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
731; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
732; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
733; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
734; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
736; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
737; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
738; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
739;
740; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
741; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
742; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
743; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
744; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
746; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
747; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
748; GFX90A-TGSPLIT-NEXT:    s_endpgm
749    i32* %out, i32 %in) {
750entry:
751  %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic
752  ret void
753}
754
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with acquire ordering. The
; exchanged value %val is never used, and the expected flat_atomic_swap is the
; form without a destination register.
; Expected code after the atomic, per run line:
;   - gfx700: s_waitcnt vmcnt(0) lgkmcnt(0), then buffer_wbinvl1_vol.
;   - gfx1010 (with and without +cumode): s_waitcnt lgkmcnt(0) and
;     s_waitcnt_vscnt null, 0x0, then buffer_gl0_inv and buffer_gl1_inv.
;   - -amdgcn-skip-cache-invalidations run: only the s_waitcnt, no invalidate.
;   - gfx90a: a wait, then buffer_invl2 and buffer_wbinvl1_vol. The +tgsplit
;     run expects s_waitcnt vmcnt(0); the run without it expects
;     s_waitcnt vmcnt(0) lgkmcnt(0).
755define amdgpu_kernel void @flat_system_acquire_atomicrmw(
756; GFX7-LABEL: flat_system_acquire_atomicrmw:
757; GFX7:       ; %bb.0: ; %entry
758; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
759; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
760; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
761; GFX7-NEXT:    v_mov_b32_e32 v0, s0
762; GFX7-NEXT:    v_mov_b32_e32 v1, s1
763; GFX7-NEXT:    v_mov_b32_e32 v2, s2
764; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
765; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
766; GFX7-NEXT:    buffer_wbinvl1_vol
767; GFX7-NEXT:    s_endpgm
768;
769; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
770; GFX10-WGP:       ; %bb.0: ; %entry
771; GFX10-WGP-NEXT:    s_clause 0x1
772; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
773; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
774; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
775; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
776; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
777; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
778; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
779; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
781; GFX10-WGP-NEXT:    buffer_gl0_inv
782; GFX10-WGP-NEXT:    buffer_gl1_inv
783; GFX10-WGP-NEXT:    s_endpgm
784;
785; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
786; GFX10-CU:       ; %bb.0: ; %entry
787; GFX10-CU-NEXT:    s_clause 0x1
788; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
789; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
790; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
792; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
793; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
794; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
795; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
797; GFX10-CU-NEXT:    buffer_gl0_inv
798; GFX10-CU-NEXT:    buffer_gl1_inv
799; GFX10-CU-NEXT:    s_endpgm
800;
801; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw:
802; SKIP-CACHE-INV:       ; %bb.0: ; %entry
803; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
804; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
805; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
806; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
809; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
810; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
811; SKIP-CACHE-INV-NEXT:    s_endpgm
812;
813; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
814; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
815; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
816; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
817; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
819; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
820; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
821; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
822; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
823; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
824; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
825;
826; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
827; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
828; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
829; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
830; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
832; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
833; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
834; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
835; GFX90A-TGSPLIT-NEXT:    buffer_invl2
836; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
837; GFX90A-TGSPLIT-NEXT:    s_endpgm
838    i32* %out, i32 %in) {
839entry:
840  %val = atomicrmw volatile xchg i32* %out, i32 %in acquire
841  ret void
842}
843
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with release ordering. The
; exchanged value %val is never used.
; Expected code immediately before the flat_atomic_swap, per run line:
;   - gfx700 (including the -amdgcn-skip-cache-invalidations run):
;     s_waitcnt vmcnt(0) lgkmcnt(0).
;   - gfx1010 (with and without +cumode): s_waitcnt vmcnt(0) lgkmcnt(0)
;     followed by s_waitcnt_vscnt null, 0x0.
;   - gfx90a (with and without +tgsplit): buffer_wbl2, then
;     s_waitcnt vmcnt(0) lgkmcnt(0).
; Nothing is expected between the atomic and s_endpgm.
844define amdgpu_kernel void @flat_system_release_atomicrmw(
845; GFX7-LABEL: flat_system_release_atomicrmw:
846; GFX7:       ; %bb.0: ; %entry
847; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
848; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
849; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
850; GFX7-NEXT:    v_mov_b32_e32 v0, s0
851; GFX7-NEXT:    v_mov_b32_e32 v1, s1
852; GFX7-NEXT:    v_mov_b32_e32 v2, s2
853; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
854; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
855; GFX7-NEXT:    s_endpgm
856;
857; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
858; GFX10-WGP:       ; %bb.0: ; %entry
859; GFX10-WGP-NEXT:    s_clause 0x1
860; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
861; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
862; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
863; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
864; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
865; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
866; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
867; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
868; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
869; GFX10-WGP-NEXT:    s_endpgm
870;
871; GFX10-CU-LABEL: flat_system_release_atomicrmw:
872; GFX10-CU:       ; %bb.0: ; %entry
873; GFX10-CU-NEXT:    s_clause 0x1
874; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
875; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
876; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
878; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
879; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
880; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
881; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
882; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
883; GFX10-CU-NEXT:    s_endpgm
884;
885; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw:
886; SKIP-CACHE-INV:       ; %bb.0: ; %entry
887; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
888; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
889; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
890; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
891; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
892; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
893; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
894; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
895; SKIP-CACHE-INV-NEXT:    s_endpgm
896;
897; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
898; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
899; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
900; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
901; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
903; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
904; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
905; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
906; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
907; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
908;
909; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
910; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
911; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
912; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
913; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
915; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
916; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
917; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
918; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
919; GFX90A-TGSPLIT-NEXT:    s_endpgm
920    i32* %out, i32 %in) {
921entry:
922  %val = atomicrmw volatile xchg i32* %out, i32 %in release
923  ret void
924}
925
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with acq_rel ordering. The
; exchanged value %val is never used.
; The expected code brackets the flat_atomic_swap on both sides:
;   - Before: s_waitcnt vmcnt(0) lgkmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on gfx1010 and a preceding buffer_wbl2 on
;     gfx90a.
;   - After: a wait followed by cache invalidates: buffer_wbinvl1_vol on
;     gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and buffer_invl2 +
;     buffer_wbinvl1_vol on gfx90a. The -amdgcn-skip-cache-invalidations run
;     expects the wait but no invalidate. The gfx90a +tgsplit run expects
;     s_waitcnt vmcnt(0) where the run without it expects
;     s_waitcnt vmcnt(0) lgkmcnt(0).
926define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
927; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
928; GFX7:       ; %bb.0: ; %entry
929; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
930; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
931; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX7-NEXT:    v_mov_b32_e32 v0, s0
933; GFX7-NEXT:    v_mov_b32_e32 v1, s1
934; GFX7-NEXT:    v_mov_b32_e32 v2, s2
935; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
936; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
937; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
938; GFX7-NEXT:    buffer_wbinvl1_vol
939; GFX7-NEXT:    s_endpgm
940;
941; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
942; GFX10-WGP:       ; %bb.0: ; %entry
943; GFX10-WGP-NEXT:    s_clause 0x1
944; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
945; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
946; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
948; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
949; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
950; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
951; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
952; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
953; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
955; GFX10-WGP-NEXT:    buffer_gl0_inv
956; GFX10-WGP-NEXT:    buffer_gl1_inv
957; GFX10-WGP-NEXT:    s_endpgm
958;
959; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
960; GFX10-CU:       ; %bb.0: ; %entry
961; GFX10-CU-NEXT:    s_clause 0x1
962; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
963; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
964; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
965; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
966; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
967; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
968; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
969; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
970; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
971; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
973; GFX10-CU-NEXT:    buffer_gl0_inv
974; GFX10-CU-NEXT:    buffer_gl1_inv
975; GFX10-CU-NEXT:    s_endpgm
976;
977; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw:
978; SKIP-CACHE-INV:       ; %bb.0: ; %entry
979; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
980; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
981; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
985; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
987; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
988; SKIP-CACHE-INV-NEXT:    s_endpgm
989;
990; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
991; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
992; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
993; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
994; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
996; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
997; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
998; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
999; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1001; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1002; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1003; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1004;
1005; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
1006; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1007; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1008; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1009; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1011; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1012; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1013; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1014; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1015; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1016; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1017; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1018; GFX90A-TGSPLIT-NEXT:    s_endpgm
1019    i32* %out, i32 %in) {
1020entry:
1021  %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
1022  ret void
1023}
1024
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with seq_cst ordering. The
; exchanged value %val is never used.
; The expected code brackets the flat_atomic_swap on both sides:
;   - Before: s_waitcnt vmcnt(0) lgkmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on gfx1010 and a preceding buffer_wbl2 on
;     gfx90a.
;   - After: a wait followed by cache invalidates: buffer_wbinvl1_vol on
;     gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and buffer_invl2 +
;     buffer_wbinvl1_vol on gfx90a. The -amdgcn-skip-cache-invalidations run
;     expects the wait but no invalidate. The gfx90a +tgsplit run expects
;     s_waitcnt vmcnt(0) where the run without it expects
;     s_waitcnt vmcnt(0) lgkmcnt(0).
1025define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
1026; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
1027; GFX7:       ; %bb.0: ; %entry
1028; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1029; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1030; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1032; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1033; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1034; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1035; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1036; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1037; GFX7-NEXT:    buffer_wbinvl1_vol
1038; GFX7-NEXT:    s_endpgm
1039;
1040; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
1041; GFX10-WGP:       ; %bb.0: ; %entry
1042; GFX10-WGP-NEXT:    s_clause 0x1
1043; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1044; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1045; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1046; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1047; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1048; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1049; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1050; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1051; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1052; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1054; GFX10-WGP-NEXT:    buffer_gl0_inv
1055; GFX10-WGP-NEXT:    buffer_gl1_inv
1056; GFX10-WGP-NEXT:    s_endpgm
1057;
1058; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
1059; GFX10-CU:       ; %bb.0: ; %entry
1060; GFX10-CU-NEXT:    s_clause 0x1
1061; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1062; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1063; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1065; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1066; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1067; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1068; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1069; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1070; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1072; GFX10-CU-NEXT:    buffer_gl0_inv
1073; GFX10-CU-NEXT:    buffer_gl1_inv
1074; GFX10-CU-NEXT:    s_endpgm
1075;
1076; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw:
1077; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1078; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1079; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1080; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1081; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1082; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1083; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1084; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1085; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1086; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1087; SKIP-CACHE-INV-NEXT:    s_endpgm
1088;
1089; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
1090; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1091; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1092; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1093; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1095; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1096; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1097; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1099; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1100; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1101; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1102; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1103;
1104; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
1105; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1106; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1107; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1108; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1110; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1111; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1112; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1113; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1114; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1115; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1116; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1117; GFX90A-TGSPLIT-NEXT:    s_endpgm
1118    i32* %out, i32 %in) {
1119entry:
1120  %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
1121  ret void
1122}
1123
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with acquire ordering, whose
; result %val is then written back through %out with a non-atomic store.
; Because the result is used, every run line expects the returning form
; "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code between the atomic and the flat_store_dword, per run line:
;   - gfx700: s_waitcnt vmcnt(0) lgkmcnt(0), then buffer_wbinvl1_vol.
;   - gfx1010 (with and without +cumode): s_waitcnt vmcnt(0) lgkmcnt(0), then
;     buffer_gl0_inv and buffer_gl1_inv.
;   - -amdgcn-skip-cache-invalidations run: only the s_waitcnt, no invalidate.
;   - gfx90a: a wait, then buffer_invl2 and buffer_wbinvl1_vol. The +tgsplit
;     run expects s_waitcnt vmcnt(0); the run without it expects
;     s_waitcnt vmcnt(0) lgkmcnt(0).
1124define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
1125; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
1126; GFX7:       ; %bb.0: ; %entry
1127; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1128; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1129; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1131; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1132; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1133; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1134; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1135; GFX7-NEXT:    buffer_wbinvl1_vol
1136; GFX7-NEXT:    flat_store_dword v[0:1], v2
1137; GFX7-NEXT:    s_endpgm
1138;
1139; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
1140; GFX10-WGP:       ; %bb.0: ; %entry
1141; GFX10-WGP-NEXT:    s_clause 0x1
1142; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1143; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1144; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1146; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1147; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1148; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1149; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1150; GFX10-WGP-NEXT:    buffer_gl0_inv
1151; GFX10-WGP-NEXT:    buffer_gl1_inv
1152; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1153; GFX10-WGP-NEXT:    s_endpgm
1154;
1155; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
1156; GFX10-CU:       ; %bb.0: ; %entry
1157; GFX10-CU-NEXT:    s_clause 0x1
1158; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1159; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1160; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1163; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1164; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1165; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1166; GFX10-CU-NEXT:    buffer_gl0_inv
1167; GFX10-CU-NEXT:    buffer_gl1_inv
1168; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1169; GFX10-CU-NEXT:    s_endpgm
1170;
1171; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw:
1172; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1173; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1174; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1175; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1176; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1177; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1178; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1179; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1180; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1181; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1182; SKIP-CACHE-INV-NEXT:    s_endpgm
1183;
1184; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
1185; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1186; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1187; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1188; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1190; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1191; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1192; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1193; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1194; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1195; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1196; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1197;
1198; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
1199; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1200; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1201; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1202; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1204; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1205; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1206; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1207; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1208; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1209; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1210; GFX90A-TGSPLIT-NEXT:    s_endpgm
1211    i32* %out, i32 %in) {
1212entry:
1213  %val = atomicrmw volatile xchg i32* %out, i32 %in acquire
  ; Consume the exchanged value so the atomic's result has a use.
1214  store i32 %val, i32* %out, align 4
1215  ret void
1216}
1217
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with acq_rel ordering, whose
; result %val is then written back through %out with a non-atomic store.
; Because the result is used, every run line expects the returning form
; "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code around the atomic:
;   - Before: s_waitcnt vmcnt(0) lgkmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on gfx1010 and a preceding buffer_wbl2 on
;     gfx90a.
;   - Between the atomic and the flat_store_dword: a wait followed by cache
;     invalidates: buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
;     buffer_gl1_inv on gfx1010, and buffer_invl2 + buffer_wbinvl1_vol on
;     gfx90a. The -amdgcn-skip-cache-invalidations run expects the wait but no
;     invalidate. The gfx90a +tgsplit run expects s_waitcnt vmcnt(0); all
;     other run lines expect s_waitcnt vmcnt(0) lgkmcnt(0).
1218define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
1219; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
1220; GFX7:       ; %bb.0: ; %entry
1221; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1222; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1223; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1224; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1225; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1226; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1227; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1228; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1229; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1230; GFX7-NEXT:    buffer_wbinvl1_vol
1231; GFX7-NEXT:    flat_store_dword v[0:1], v2
1232; GFX7-NEXT:    s_endpgm
1233;
1234; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
1235; GFX10-WGP:       ; %bb.0: ; %entry
1236; GFX10-WGP-NEXT:    s_clause 0x1
1237; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1238; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1239; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1240; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1241; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1242; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1243; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1244; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1245; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1246; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1247; GFX10-WGP-NEXT:    buffer_gl0_inv
1248; GFX10-WGP-NEXT:    buffer_gl1_inv
1249; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1250; GFX10-WGP-NEXT:    s_endpgm
1251;
1252; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
1253; GFX10-CU:       ; %bb.0: ; %entry
1254; GFX10-CU-NEXT:    s_clause 0x1
1255; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1256; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1257; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1259; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1260; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1261; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1262; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1263; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1264; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1265; GFX10-CU-NEXT:    buffer_gl0_inv
1266; GFX10-CU-NEXT:    buffer_gl1_inv
1267; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1268; GFX10-CU-NEXT:    s_endpgm
1269;
1270; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw:
1271; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1272; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1273; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1274; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1275; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1276; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1277; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1278; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1279; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1280; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1281; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1282; SKIP-CACHE-INV-NEXT:    s_endpgm
1283;
1284; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
1285; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1286; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1288; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1290; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1291; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1292; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1293; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1294; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1295; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1296; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1297; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1298; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1299;
1300; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
1301; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1302; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1303; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1304; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1305; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1306; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1307; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1308; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1309; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1310; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1311; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1312; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1313; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1314; GFX90A-TGSPLIT-NEXT:    s_endpgm
1315    i32* %out, i32 %in) {
1316entry:
1317  %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
  ; Consume the exchanged value so the atomic's result has a use.
1318  store i32 %val, i32* %out, align 4
1319  ret void
1320}
1321
; Kernel under test: a volatile atomic exchange of %in into *%out (pointer in
; the default address space, no syncscope given) with seq_cst ordering, whose
; result %val is then written back through %out with a non-atomic store.
; Because the result is used, every run line expects the returning form
; "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code around the atomic:
;   - Before: s_waitcnt vmcnt(0) lgkmcnt(0) on every run line, plus
;     s_waitcnt_vscnt null, 0x0 on gfx1010 and a preceding buffer_wbl2 on
;     gfx90a.
;   - Between the atomic and the flat_store_dword: a wait followed by cache
;     invalidates: buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
;     buffer_gl1_inv on gfx1010, and buffer_invl2 + buffer_wbinvl1_vol on
;     gfx90a. The -amdgcn-skip-cache-invalidations run expects the wait but no
;     invalidate. The gfx90a +tgsplit run expects s_waitcnt vmcnt(0); all
;     other run lines expect s_waitcnt vmcnt(0) lgkmcnt(0).
1322define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
1323; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
1324; GFX7:       ; %bb.0: ; %entry
1325; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1326; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1327; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1328; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1329; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1330; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1331; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1332; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1333; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1334; GFX7-NEXT:    buffer_wbinvl1_vol
1335; GFX7-NEXT:    flat_store_dword v[0:1], v2
1336; GFX7-NEXT:    s_endpgm
1337;
1338; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
1339; GFX10-WGP:       ; %bb.0: ; %entry
1340; GFX10-WGP-NEXT:    s_clause 0x1
1341; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1342; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1343; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1345; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1346; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1347; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1348; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1349; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1350; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1351; GFX10-WGP-NEXT:    buffer_gl0_inv
1352; GFX10-WGP-NEXT:    buffer_gl1_inv
1353; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1354; GFX10-WGP-NEXT:    s_endpgm
1355;
1356; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
1357; GFX10-CU:       ; %bb.0: ; %entry
1358; GFX10-CU-NEXT:    s_clause 0x1
1359; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1360; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1361; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1363; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1364; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1365; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1366; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1367; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1368; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1369; GFX10-CU-NEXT:    buffer_gl0_inv
1370; GFX10-CU-NEXT:    buffer_gl1_inv
1371; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1372; GFX10-CU-NEXT:    s_endpgm
1373;
1374; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw:
1375; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1376; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1377; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1378; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1379; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1380; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1381; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1382; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1383; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1384; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1385; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1386; SKIP-CACHE-INV-NEXT:    s_endpgm
1387;
1388; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
1389; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1390; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1391; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1392; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1393; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1394; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1395; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1396; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1397; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1398; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1399; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1400; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1401; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1402; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1403;
1404; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
1405; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1406; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1407; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1408; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1410; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1411; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1412; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1413; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1414; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1415; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1416; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1417; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1418; GFX90A-TGSPLIT-NEXT:    s_endpgm
1419    i32* %out, i32 %in) {
1420entry:
1421  %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
  ; Consume the exchanged value so the atomic's result has a use.
1422  store i32 %val, i32* %out, align 4
1423  ret void
1424}
1425
; cmpxchg on a default-address-space pointer with monotonic success ordering,
; monotonic failure ordering and no syncscope operand.
; In every run the expected code issues flat_atomic_cmpswap directly after the
; operand setup and is followed immediately by s_endpgm - no s_waitcnt and no
; buffer_* cache instruction is expected around the atomic.
1426define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
1427; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1428; GFX7:       ; %bb.0: ; %entry
1429; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1430; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1431; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1432; GFX7-NEXT:    s_add_u32 s0, s0, 16
1433; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1434; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1435; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1436; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1437; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1438; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1439; GFX7-NEXT:    s_endpgm
1440;
1441; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1442; GFX10-WGP:       ; %bb.0: ; %entry
1443; GFX10-WGP-NEXT:    s_clause 0x1
1444; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1445; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1446; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1447; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1448; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1449; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1450; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1451; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1452; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1453; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1454; GFX10-WGP-NEXT:    s_endpgm
1455;
1456; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1457; GFX10-CU:       ; %bb.0: ; %entry
1458; GFX10-CU-NEXT:    s_clause 0x1
1459; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1460; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1461; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1462; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1463; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1464; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1465; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1466; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1467; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1468; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1469; GFX10-CU-NEXT:    s_endpgm
1470;
1471; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1472; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1473; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1474; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1475; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1476; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1477; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1478; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1479; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1480; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1481; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1482; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1483; SKIP-CACHE-INV-NEXT:    s_endpgm
1484;
1485; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1486; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1487; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1488; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1489; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1491; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1492; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1493; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1494;
1495; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
1496; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1497; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1498; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1499; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1500; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1501; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1502; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1503; GFX90A-TGSPLIT-NEXT:    s_endpgm
1504    i32* %out, i32 %in, i32 %old) {
1505entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
1506  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
1507  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic
1508  ret void
1509}
1510
; cmpxchg on a default-address-space pointer with acquire success ordering,
; monotonic failure ordering and no syncscope operand.
; Nothing extra is expected before the atomic. After it, the expected code
; waits and then invalidates caches
;   - gfx700 amdhsa run - s_waitcnt vmcnt(0) lgkmcnt(0), buffer_wbinvl1_vol
;   - gfx1010 runs - s_waitcnt lgkmcnt(0), s_waitcnt_vscnt null, 0x0,
;     buffer_gl0_inv, buffer_gl1_inv
;   - gfx90a runs - s_waitcnt (vmcnt(0) only with +tgsplit), buffer_invl2,
;     buffer_wbinvl1_vol
;   - run with -amdgcn-skip-cache-invalidations - only the s_waitcnt remains
1511define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
1512; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
1513; GFX7:       ; %bb.0: ; %entry
1514; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1515; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1516; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX7-NEXT:    s_add_u32 s0, s0, 16
1518; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1519; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1520; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1521; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1522; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1523; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1524; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1525; GFX7-NEXT:    buffer_wbinvl1_vol
1526; GFX7-NEXT:    s_endpgm
1527;
1528; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
1529; GFX10-WGP:       ; %bb.0: ; %entry
1530; GFX10-WGP-NEXT:    s_clause 0x1
1531; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1532; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1533; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1534; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1535; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1536; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1537; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1538; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1539; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1540; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1541; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1542; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1543; GFX10-WGP-NEXT:    buffer_gl0_inv
1544; GFX10-WGP-NEXT:    buffer_gl1_inv
1545; GFX10-WGP-NEXT:    s_endpgm
1546;
1547; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
1548; GFX10-CU:       ; %bb.0: ; %entry
1549; GFX10-CU-NEXT:    s_clause 0x1
1550; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1551; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1552; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1554; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1555; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1556; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1557; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1558; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1559; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1560; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1562; GFX10-CU-NEXT:    buffer_gl0_inv
1563; GFX10-CU-NEXT:    buffer_gl1_inv
1564; GFX10-CU-NEXT:    s_endpgm
1565;
1566; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg:
1567; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1568; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1569; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1570; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1571; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1572; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1573; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1574; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1575; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1576; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1577; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1578; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1579; SKIP-CACHE-INV-NEXT:    s_endpgm
1580;
1581; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
1582; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1583; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1584; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1585; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1587; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1588; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1589; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1590; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1591; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1592; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1593;
1594; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
1595; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1596; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1597; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1598; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1600; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1601; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1602; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1603; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1604; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1605; GFX90A-TGSPLIT-NEXT:    s_endpgm
1606    i32* %out, i32 %in, i32 %old) {
1607entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
1608  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
1609  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic
1610  ret void
1611}
1612
; cmpxchg on a default-address-space pointer with release success ordering,
; monotonic failure ordering and no syncscope operand.
; The expected code waits before the atomic and adds nothing after it
;   - gfx700 runs - s_waitcnt vmcnt(0) lgkmcnt(0)
;   - gfx1010 runs - s_waitcnt vmcnt(0) lgkmcnt(0), s_waitcnt_vscnt null, 0x0
;   - gfx90a runs - buffer_wbl2, then s_waitcnt vmcnt(0) lgkmcnt(0)
; s_endpgm is expected directly after flat_atomic_cmpswap in every run.
1613define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
1614; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
1615; GFX7:       ; %bb.0: ; %entry
1616; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1617; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1618; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX7-NEXT:    s_add_u32 s0, s0, 16
1620; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1621; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1623; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1624; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1625; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1626; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1627; GFX7-NEXT:    s_endpgm
1628;
1629; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
1630; GFX10-WGP:       ; %bb.0: ; %entry
1631; GFX10-WGP-NEXT:    s_clause 0x1
1632; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1633; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1634; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1635; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1636; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1637; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1638; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1639; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1640; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1641; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1642; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1643; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1644; GFX10-WGP-NEXT:    s_endpgm
1645;
1646; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
1647; GFX10-CU:       ; %bb.0: ; %entry
1648; GFX10-CU-NEXT:    s_clause 0x1
1649; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1650; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1651; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1652; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1653; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1654; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1655; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1656; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1657; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1658; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1659; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1660; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1661; GFX10-CU-NEXT:    s_endpgm
1662;
1663; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg:
1664; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1665; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1666; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1667; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1668; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1669; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1670; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1671; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1672; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1673; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1674; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1675; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1676; SKIP-CACHE-INV-NEXT:    s_endpgm
1677;
1678; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
1679; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1680; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1681; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1682; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1684; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1685; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1686; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1687; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1688; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1689;
1690; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
1691; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1692; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1693; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1694; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1696; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1697; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1698; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1699; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1700; GFX90A-TGSPLIT-NEXT:    s_endpgm
1701    i32* %out, i32 %in, i32 %old) {
1702entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
1703  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
1704  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic
1705  ret void
1706}
1707
; cmpxchg on a default-address-space pointer with acq_rel success ordering,
; monotonic failure ordering and no syncscope operand.
; The expected code brackets the atomic on both sides
;   - before - s_waitcnt vmcnt(0) lgkmcnt(0), plus s_waitcnt_vscnt null, 0x0 in
;     the gfx1010 runs and a leading buffer_wbl2 in the gfx90a runs
;   - after - a wait followed by cache invalidation (buffer_wbinvl1_vol on the
;     gfx700 amdhsa run, buffer_gl0_inv + buffer_gl1_inv on gfx1010,
;     buffer_invl2 + buffer_wbinvl1_vol on gfx90a)
; With -amdgcn-skip-cache-invalidations only the two waits are expected.
1708define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
1709; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1710; GFX7:       ; %bb.0: ; %entry
1711; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1712; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1713; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1714; GFX7-NEXT:    s_add_u32 s0, s0, 16
1715; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1716; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1717; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1718; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1719; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1720; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1721; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1722; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1723; GFX7-NEXT:    buffer_wbinvl1_vol
1724; GFX7-NEXT:    s_endpgm
1725;
1726; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1727; GFX10-WGP:       ; %bb.0: ; %entry
1728; GFX10-WGP-NEXT:    s_clause 0x1
1729; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1730; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1731; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1732; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1733; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1734; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1735; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1736; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1737; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1738; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1739; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1740; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1741; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1742; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1743; GFX10-WGP-NEXT:    buffer_gl0_inv
1744; GFX10-WGP-NEXT:    buffer_gl1_inv
1745; GFX10-WGP-NEXT:    s_endpgm
1746;
1747; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1748; GFX10-CU:       ; %bb.0: ; %entry
1749; GFX10-CU-NEXT:    s_clause 0x1
1750; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1751; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1752; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1754; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1755; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1756; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1757; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1758; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1759; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1760; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1761; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1762; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1763; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1764; GFX10-CU-NEXT:    buffer_gl0_inv
1765; GFX10-CU-NEXT:    buffer_gl1_inv
1766; GFX10-CU-NEXT:    s_endpgm
1767;
1768; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1769; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1770; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1771; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1772; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1773; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1774; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1775; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1776; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1777; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1778; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1779; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1780; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1781; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1782; SKIP-CACHE-INV-NEXT:    s_endpgm
1783;
1784; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1785; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1786; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1787; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1788; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1790; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1791; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1792; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1793; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1794; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1795; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1796; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1797; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1798;
1799; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
1800; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1801; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1802; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1803; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1804; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1805; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1806; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1807; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1808; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1809; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1810; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1811; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1812; GFX90A-TGSPLIT-NEXT:    s_endpgm
1813    i32* %out, i32 %in, i32 %old) {
1814entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
1815  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
1816  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic
1817  ret void
1818}
1819
; cmpxchg on a default-address-space pointer with seq_cst success ordering,
; monotonic failure ordering and no syncscope operand.
; The expected code brackets the atomic on both sides
;   - before - s_waitcnt vmcnt(0) lgkmcnt(0), plus s_waitcnt_vscnt null, 0x0 in
;     the gfx1010 runs and a leading buffer_wbl2 in the gfx90a runs
;   - after - a wait followed by cache invalidation (buffer_wbinvl1_vol on the
;     gfx700 amdhsa run, buffer_gl0_inv + buffer_gl1_inv on gfx1010,
;     buffer_invl2 + buffer_wbinvl1_vol on gfx90a)
; With -amdgcn-skip-cache-invalidations only the two waits are expected.
1820define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
1821; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1822; GFX7:       ; %bb.0: ; %entry
1823; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1824; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1825; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1826; GFX7-NEXT:    s_add_u32 s0, s0, 16
1827; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1828; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1829; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1830; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1831; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1832; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1833; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1834; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1835; GFX7-NEXT:    buffer_wbinvl1_vol
1836; GFX7-NEXT:    s_endpgm
1837;
1838; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1839; GFX10-WGP:       ; %bb.0: ; %entry
1840; GFX10-WGP-NEXT:    s_clause 0x1
1841; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1842; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1843; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1845; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1846; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1847; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1848; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1849; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1850; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1851; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1852; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1853; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1854; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1855; GFX10-WGP-NEXT:    buffer_gl0_inv
1856; GFX10-WGP-NEXT:    buffer_gl1_inv
1857; GFX10-WGP-NEXT:    s_endpgm
1858;
1859; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1860; GFX10-CU:       ; %bb.0: ; %entry
1861; GFX10-CU-NEXT:    s_clause 0x1
1862; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1863; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1864; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1866; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1867; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1868; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1869; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1870; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1871; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1872; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1873; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1874; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1875; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1876; GFX10-CU-NEXT:    buffer_gl0_inv
1877; GFX10-CU-NEXT:    buffer_gl1_inv
1878; GFX10-CU-NEXT:    s_endpgm
1879;
1880; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1881; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1882; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1883; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1884; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1885; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1886; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1887; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1888; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1889; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1890; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1891; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1892; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1893; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1894; SKIP-CACHE-INV-NEXT:    s_endpgm
1895;
1896; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1897; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1898; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1899; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1900; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1901; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1902; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1903; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1904; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1905; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1906; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1907; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1908; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1909; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1910;
1911; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
1912; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1913; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1914; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1915; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1916; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1917; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1918; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1919; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1920; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
1921; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1922; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1923; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1924; GFX90A-TGSPLIT-NEXT:    s_endpgm
1925    i32* %out, i32 %in, i32 %old) {
1926entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
1927  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
1928  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic
1929  ret void
1930}
1931
; cmpxchg on a default-address-space pointer with monotonic success ordering,
; acquire failure ordering and no syncscope operand.
; Although the success ordering is only monotonic, the expected code still
; waits and invalidates caches after the atomic, and adds nothing before it
;   - gfx700 amdhsa run - s_waitcnt vmcnt(0) lgkmcnt(0), buffer_wbinvl1_vol
;   - gfx1010 runs - s_waitcnt lgkmcnt(0), s_waitcnt_vscnt null, 0x0,
;     buffer_gl0_inv, buffer_gl1_inv
;   - gfx90a runs - s_waitcnt (vmcnt(0) only with +tgsplit), buffer_invl2,
;     buffer_wbinvl1_vol
;   - run with -amdgcn-skip-cache-invalidations - only the s_waitcnt remains
1932define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
1933; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
1934; GFX7:       ; %bb.0: ; %entry
1935; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1936; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1937; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX7-NEXT:    s_add_u32 s0, s0, 16
1939; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1940; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1941; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1942; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1943; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1944; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1945; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1946; GFX7-NEXT:    buffer_wbinvl1_vol
1947; GFX7-NEXT:    s_endpgm
1948;
1949; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
1950; GFX10-WGP:       ; %bb.0: ; %entry
1951; GFX10-WGP-NEXT:    s_clause 0x1
1952; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1953; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1954; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1956; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1957; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1958; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1959; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1960; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1961; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1962; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1963; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1964; GFX10-WGP-NEXT:    buffer_gl0_inv
1965; GFX10-WGP-NEXT:    buffer_gl1_inv
1966; GFX10-WGP-NEXT:    s_endpgm
1967;
1968; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
1969; GFX10-CU:       ; %bb.0: ; %entry
1970; GFX10-CU-NEXT:    s_clause 0x1
1971; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1972; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1973; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1974; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1975; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1976; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1977; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1978; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1979; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1980; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1981; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1983; GFX10-CU-NEXT:    buffer_gl0_inv
1984; GFX10-CU-NEXT:    buffer_gl1_inv
1985; GFX10-CU-NEXT:    s_endpgm
1986;
1987; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg:
1988; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1989; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1990; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1991; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1992; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1993; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1994; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1995; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1996; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1997; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1998; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1999; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2000; SKIP-CACHE-INV-NEXT:    s_endpgm
2001;
2002; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
2003; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2004; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2005; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2006; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2008; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2009; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2010; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2011; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2012; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2013; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2014;
2015; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
2016; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2017; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2018; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2019; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2021; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2022; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2023; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2024; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2025; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2026; GFX90A-TGSPLIT-NEXT:    s_endpgm
2027    i32* %out, i32 %in, i32 %old) {
2028entry:
  ; Element 4 of an i32 array, i.e. 16 bytes past %out. The expected code either
  ; adds 16 to the base address or, in the gfx90a runs, uses a 16-byte
  ; instruction offset.
2029  %gep = getelementptr i32, i32* %out, i32 4
  ; %val is never read; the expected flat_atomic_cmpswap carries no glc modifier.
2030  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire
2031  ret void
2032}
2033
; Test kernel: volatile i32 cmpxchg with success ordering `acquire` and failure
; ordering `acquire`, through a plain `i32*` (no addrspace qualifier) at
; %out + 4 elements. The 16-byte displacement shows up below either as
; `s_add_u32 ..., 16` / `s_addc_u32 ..., 0` or as `offset:16` on the atomic.
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; The autogenerated checks pin the llc output per configuration from the file
; header: each one expects a wait after flat_atomic_cmpswap, followed by
; buffer_wbinvl1_vol (gfx700), buffer_gl0_inv + buffer_gl1_inv (gfx1010), or
; buffer_invl2 + buffer_wbinvl1_vol (gfx90a); the skip-cache-invalidations run
; expects only the wait. No wait is expected directly before the atomic.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2034define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
2035; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
2036; GFX7:       ; %bb.0: ; %entry
2037; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2038; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2039; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2040; GFX7-NEXT:    s_add_u32 s0, s0, 16
2041; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2042; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2043; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2044; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2045; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2046; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2047; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2048; GFX7-NEXT:    buffer_wbinvl1_vol
2049; GFX7-NEXT:    s_endpgm
2050;
2051; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
2052; GFX10-WGP:       ; %bb.0: ; %entry
2053; GFX10-WGP-NEXT:    s_clause 0x1
2054; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2055; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2056; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2057; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2058; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2059; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2060; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2061; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2062; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2063; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2064; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2065; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2066; GFX10-WGP-NEXT:    buffer_gl0_inv
2067; GFX10-WGP-NEXT:    buffer_gl1_inv
2068; GFX10-WGP-NEXT:    s_endpgm
2069;
2070; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
2071; GFX10-CU:       ; %bb.0: ; %entry
2072; GFX10-CU-NEXT:    s_clause 0x1
2073; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2074; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2075; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2076; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2077; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2079; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2080; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2081; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2082; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2083; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2084; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2085; GFX10-CU-NEXT:    buffer_gl0_inv
2086; GFX10-CU-NEXT:    buffer_gl1_inv
2087; GFX10-CU-NEXT:    s_endpgm
2088;
2089; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg:
2090; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2091; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2092; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2093; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2094; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2095; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2097; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2099; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2100; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2101; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2102; SKIP-CACHE-INV-NEXT:    s_endpgm
2103;
2104; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
2105; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2106; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2107; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2109; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2110; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2111; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2113; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2114; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2115; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2116;
2117; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
2118; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2119; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2120; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2121; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2122; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2123; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2124; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2125; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2126; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2127; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2128; GFX90A-TGSPLIT-NEXT:    s_endpgm
2129    i32* %out, i32 %in, i32 %old) {
2130entry:
2131  %gep = getelementptr i32, i32* %out, i32 4
2132  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire
2133  ret void
2134}
2135
; Test kernel: volatile i32 cmpxchg with success ordering `release` and failure
; ordering `acquire`, through a plain `i32*` (no addrspace qualifier) at
; %out + 4 elements (16 bytes: `s_add_u32 ..., 16` or `offset:16` below).
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; The autogenerated checks expect, in every configuration, a wait immediately
; before flat_atomic_cmpswap (`s_waitcnt vmcnt(0) lgkmcnt(0)`, plus
; `s_waitcnt_vscnt null, 0x0` on gfx1010, and preceded by `buffer_wbl2` on
; gfx90a) and a second wait after it. The post-atomic buffer_*inv* instructions
; are expected everywhere except in the skip-cache-invalidations run.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2136define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
2137; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
2138; GFX7:       ; %bb.0: ; %entry
2139; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2140; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2141; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX7-NEXT:    s_add_u32 s0, s0, 16
2143; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2144; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2145; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2146; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2147; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2148; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2149; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2150; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2151; GFX7-NEXT:    buffer_wbinvl1_vol
2152; GFX7-NEXT:    s_endpgm
2153;
2154; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
2155; GFX10-WGP:       ; %bb.0: ; %entry
2156; GFX10-WGP-NEXT:    s_clause 0x1
2157; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2158; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2159; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2161; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2162; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2163; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2164; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2165; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2166; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2167; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2168; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2169; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2170; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2171; GFX10-WGP-NEXT:    buffer_gl0_inv
2172; GFX10-WGP-NEXT:    buffer_gl1_inv
2173; GFX10-WGP-NEXT:    s_endpgm
2174;
2175; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
2176; GFX10-CU:       ; %bb.0: ; %entry
2177; GFX10-CU-NEXT:    s_clause 0x1
2178; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2179; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2180; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2182; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2183; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2184; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2185; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2186; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2187; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2188; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2189; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2190; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2191; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2192; GFX10-CU-NEXT:    buffer_gl0_inv
2193; GFX10-CU-NEXT:    buffer_gl1_inv
2194; GFX10-CU-NEXT:    s_endpgm
2195;
2196; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg:
2197; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2198; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2199; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2200; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2201; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2202; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2203; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2206; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2207; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2208; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2209; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2210; SKIP-CACHE-INV-NEXT:    s_endpgm
2211;
2212; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
2213; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2214; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2215; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2216; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2217; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2218; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2219; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2220; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2221; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2222; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2223; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2224; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2225; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2226;
2227; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
2228; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2229; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2230; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2231; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2233; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2234; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2235; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2236; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2237; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2238; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2239; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2240; GFX90A-TGSPLIT-NEXT:    s_endpgm
2241    i32* %out, i32 %in, i32 %old) {
2242entry:
2243  %gep = getelementptr i32, i32* %out, i32 4
2244  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire
2245  ret void
2246}
2247
; Test kernel: volatile i32 cmpxchg with success ordering `acq_rel` and failure
; ordering `acquire`, through a plain `i32*` (no addrspace qualifier) at
; %out + 4 elements (16 bytes: `s_add_u32 ..., 16` or `offset:16` below).
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; The autogenerated checks expect flat_atomic_cmpswap to be bracketed by waits
; in every configuration: `s_waitcnt vmcnt(0) lgkmcnt(0)` before it (with
; `s_waitcnt_vscnt null, 0x0` added on gfx1010 and `buffer_wbl2` first on
; gfx90a), and a wait plus buffer_*inv* instructions after it. Only the
; skip-cache-invalidations run expects no buffer_*inv* instructions.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2248define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
2249; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2250; GFX7:       ; %bb.0: ; %entry
2251; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2252; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2253; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2254; GFX7-NEXT:    s_add_u32 s0, s0, 16
2255; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2256; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2257; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2258; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2259; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2260; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2261; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2262; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2263; GFX7-NEXT:    buffer_wbinvl1_vol
2264; GFX7-NEXT:    s_endpgm
2265;
2266; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2267; GFX10-WGP:       ; %bb.0: ; %entry
2268; GFX10-WGP-NEXT:    s_clause 0x1
2269; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2270; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2271; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2272; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2273; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2274; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2275; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2276; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2277; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2278; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2279; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2280; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2281; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2282; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2283; GFX10-WGP-NEXT:    buffer_gl0_inv
2284; GFX10-WGP-NEXT:    buffer_gl1_inv
2285; GFX10-WGP-NEXT:    s_endpgm
2286;
2287; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2288; GFX10-CU:       ; %bb.0: ; %entry
2289; GFX10-CU-NEXT:    s_clause 0x1
2290; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2291; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2292; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2293; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2294; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2296; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2297; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2298; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2299; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2300; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2301; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2302; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2303; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2304; GFX10-CU-NEXT:    buffer_gl0_inv
2305; GFX10-CU-NEXT:    buffer_gl1_inv
2306; GFX10-CU-NEXT:    s_endpgm
2307;
2308; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2309; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2310; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2311; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2312; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2313; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2314; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2316; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2317; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2318; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2319; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2320; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2321; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2322; SKIP-CACHE-INV-NEXT:    s_endpgm
2323;
2324; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2325; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2326; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2327; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2328; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2329; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2330; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2331; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2332; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2333; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2334; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2335; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2336; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2337; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2338;
2339; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
2340; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2341; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2342; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2343; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2344; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2345; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2346; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2347; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2348; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2349; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2350; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2351; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2352; GFX90A-TGSPLIT-NEXT:    s_endpgm
2353    i32* %out, i32 %in, i32 %old) {
2354entry:
2355  %gep = getelementptr i32, i32* %out, i32 4
2356  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire
2357  ret void
2358}
2359
; Test kernel: volatile i32 cmpxchg with success ordering `seq_cst` and failure
; ordering `acquire`, through a plain `i32*` (no addrspace qualifier) at
; %out + 4 elements (16 bytes: `s_add_u32 ..., 16` or `offset:16` below).
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; The autogenerated checks expect a wait on both sides of flat_atomic_cmpswap
; in every configuration. Before it: `s_waitcnt vmcnt(0) lgkmcnt(0)`, with
; `s_waitcnt_vscnt null, 0x0` on gfx1010 and a leading `buffer_wbl2` on gfx90a.
; After it: a wait, then buffer_*inv* instructions, which are absent only in
; the skip-cache-invalidations run.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2360define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
2361; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2362; GFX7:       ; %bb.0: ; %entry
2363; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2364; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2365; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2366; GFX7-NEXT:    s_add_u32 s0, s0, 16
2367; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2368; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2369; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2370; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2371; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2372; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2373; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2374; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2375; GFX7-NEXT:    buffer_wbinvl1_vol
2376; GFX7-NEXT:    s_endpgm
2377;
2378; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2379; GFX10-WGP:       ; %bb.0: ; %entry
2380; GFX10-WGP-NEXT:    s_clause 0x1
2381; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2382; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2383; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2384; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2385; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2386; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2387; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2388; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2389; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2390; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2391; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2392; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2393; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2394; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2395; GFX10-WGP-NEXT:    buffer_gl0_inv
2396; GFX10-WGP-NEXT:    buffer_gl1_inv
2397; GFX10-WGP-NEXT:    s_endpgm
2398;
2399; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2400; GFX10-CU:       ; %bb.0: ; %entry
2401; GFX10-CU-NEXT:    s_clause 0x1
2402; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2403; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2404; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2405; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2406; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2407; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2408; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2409; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2410; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2411; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2412; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2413; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2414; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2415; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2416; GFX10-CU-NEXT:    buffer_gl0_inv
2417; GFX10-CU-NEXT:    buffer_gl1_inv
2418; GFX10-CU-NEXT:    s_endpgm
2419;
2420; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2421; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2422; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2423; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2424; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2425; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2426; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2427; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2428; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2429; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2431; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2432; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2433; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2434; SKIP-CACHE-INV-NEXT:    s_endpgm
2435;
2436; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2437; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2438; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2439; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2440; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2441; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2442; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2443; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2444; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2445; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2446; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2447; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2448; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2449; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2450;
2451; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
2452; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2453; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2454; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2455; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2456; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2457; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2458; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2459; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2460; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2461; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2462; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2463; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2464; GFX90A-TGSPLIT-NEXT:    s_endpgm
2465    i32* %out, i32 %in, i32 %old) {
2466entry:
2467  %gep = getelementptr i32, i32* %out, i32 4
2468  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire
2469  ret void
2470}
2471
; Test kernel: volatile i32 cmpxchg with success ordering `monotonic` and
; failure ordering `seq_cst` (the failure ordering is the stronger of the two),
; through a plain `i32*` (no addrspace qualifier) at %out + 4 elements
; (16 bytes: `s_add_u32 ..., 16` or `offset:16` below).
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; Although the success ordering is only `monotonic`, the autogenerated checks
; still expect a wait before flat_atomic_cmpswap (`s_waitcnt vmcnt(0)
; lgkmcnt(0)`, plus `s_waitcnt_vscnt null, 0x0` on gfx1010 and a leading
; `buffer_wbl2` on gfx90a) and a wait plus buffer_*inv* instructions after it.
; The skip-cache-invalidations run expects the waits but no buffer_*inv*.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2472define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
2473; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2474; GFX7:       ; %bb.0: ; %entry
2475; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2476; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2477; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX7-NEXT:    s_add_u32 s0, s0, 16
2479; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2480; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2481; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2482; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2483; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2484; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2485; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2486; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2487; GFX7-NEXT:    buffer_wbinvl1_vol
2488; GFX7-NEXT:    s_endpgm
2489;
2490; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2491; GFX10-WGP:       ; %bb.0: ; %entry
2492; GFX10-WGP-NEXT:    s_clause 0x1
2493; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2494; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2495; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2496; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2497; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2499; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2500; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2501; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2502; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2503; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2504; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2505; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2507; GFX10-WGP-NEXT:    buffer_gl0_inv
2508; GFX10-WGP-NEXT:    buffer_gl1_inv
2509; GFX10-WGP-NEXT:    s_endpgm
2510;
2511; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2512; GFX10-CU:       ; %bb.0: ; %entry
2513; GFX10-CU-NEXT:    s_clause 0x1
2514; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2515; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2516; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2517; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2518; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2519; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2520; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2521; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2522; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2523; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2524; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2525; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2526; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2527; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2528; GFX10-CU-NEXT:    buffer_gl0_inv
2529; GFX10-CU-NEXT:    buffer_gl1_inv
2530; GFX10-CU-NEXT:    s_endpgm
2531;
2532; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2533; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2534; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2535; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2536; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2537; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2538; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2541; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2543; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2544; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2545; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2546; SKIP-CACHE-INV-NEXT:    s_endpgm
2547;
2548; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2549; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2550; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2552; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2554; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2555; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2556; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2557; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2558; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2559; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2560; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2561; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2562;
2563; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
2564; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2565; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2566; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2567; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2568; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2569; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2570; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2571; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2572; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2573; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2574; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2575; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2576; GFX90A-TGSPLIT-NEXT:    s_endpgm
2577    i32* %out, i32 %in, i32 %old) {
2578entry:
2579  %gep = getelementptr i32, i32* %out, i32 4
2580  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst
2581  ret void
2582}
2583
; Test kernel: volatile i32 cmpxchg with success ordering `acquire` and failure
; ordering `seq_cst` (the failure ordering is the stronger of the two), through
; a plain `i32*` (no addrspace qualifier) at %out + 4 elements (16 bytes:
; `s_add_u32 ..., 16` or `offset:16` below).
; The cmpxchg carries no syncscope; the kernel name labels it the "system"
; variant. The cmpxchg result (%val) is never used.
; Although the success ordering is only `acquire`, the autogenerated checks
; expect a wait before flat_atomic_cmpswap as well as after it: `s_waitcnt
; vmcnt(0) lgkmcnt(0)` (plus `s_waitcnt_vscnt null, 0x0` on gfx1010 and a
; leading `buffer_wbl2` on gfx90a) precedes the atomic, and a wait followed by
; buffer_*inv* instructions comes after. The skip-cache-invalidations run
; expects the waits but no buffer_*inv*.
; Regenerate with utils/update_llc_test_checks.py instead of editing by hand.
2584define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
2585; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2586; GFX7:       ; %bb.0: ; %entry
2587; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2588; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2589; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2590; GFX7-NEXT:    s_add_u32 s0, s0, 16
2591; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2592; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2593; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2594; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2595; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2596; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2597; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2598; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2599; GFX7-NEXT:    buffer_wbinvl1_vol
2600; GFX7-NEXT:    s_endpgm
2601;
2602; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2603; GFX10-WGP:       ; %bb.0: ; %entry
2604; GFX10-WGP-NEXT:    s_clause 0x1
2605; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2606; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2607; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2608; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2609; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2610; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2611; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2612; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2613; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2614; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2615; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2616; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2617; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2618; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2619; GFX10-WGP-NEXT:    buffer_gl0_inv
2620; GFX10-WGP-NEXT:    buffer_gl1_inv
2621; GFX10-WGP-NEXT:    s_endpgm
2622;
2623; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2624; GFX10-CU:       ; %bb.0: ; %entry
2625; GFX10-CU-NEXT:    s_clause 0x1
2626; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2627; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2628; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2629; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2630; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2631; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2632; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2633; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2634; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2635; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2636; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2637; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2638; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2639; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2640; GFX10-CU-NEXT:    buffer_gl0_inv
2641; GFX10-CU-NEXT:    buffer_gl1_inv
2642; GFX10-CU-NEXT:    s_endpgm
2643;
2644; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2645; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2646; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2647; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2648; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2649; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2650; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2651; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2652; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2653; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2654; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2655; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2656; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2657; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2658; SKIP-CACHE-INV-NEXT:    s_endpgm
2659;
2660; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2661; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2662; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2663; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2664; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2665; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2666; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2667; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2668; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2669; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2670; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2671; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2672; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2673; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2674;
2675; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
2676; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2677; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2678; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2679; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2680; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2681; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2682; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2683; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2684; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2685; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2686; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2687; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2688; GFX90A-TGSPLIT-NEXT:    s_endpgm
2689    i32* %out, i32 %in, i32 %old) {
2690entry:
2691  %gep = getelementptr i32, i32* %out, i32 4
2692  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst
2693  ret void
2694}
2695
; Non-returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using success ordering `release` and failure ordering `seq_cst`;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; %val is never used, and every prefix expects flat_atomic_cmpswap without a
; destination register and without `glc`.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2696define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
2697; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
2698; GFX7:       ; %bb.0: ; %entry
2699; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2700; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2701; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2702; GFX7-NEXT:    s_add_u32 s0, s0, 16
2703; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2704; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2705; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2706; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2707; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2708; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2709; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2710; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2711; GFX7-NEXT:    buffer_wbinvl1_vol
2712; GFX7-NEXT:    s_endpgm
2713;
2714; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
2715; GFX10-WGP:       ; %bb.0: ; %entry
2716; GFX10-WGP-NEXT:    s_clause 0x1
2717; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2718; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2719; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2720; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2721; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2722; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2723; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2724; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2725; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2726; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2727; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2728; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2729; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2731; GFX10-WGP-NEXT:    buffer_gl0_inv
2732; GFX10-WGP-NEXT:    buffer_gl1_inv
2733; GFX10-WGP-NEXT:    s_endpgm
2734;
2735; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
2736; GFX10-CU:       ; %bb.0: ; %entry
2737; GFX10-CU-NEXT:    s_clause 0x1
2738; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2739; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2740; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2741; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2742; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2743; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2744; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2745; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2746; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2747; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2748; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2749; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2750; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2751; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2752; GFX10-CU-NEXT:    buffer_gl0_inv
2753; GFX10-CU-NEXT:    buffer_gl1_inv
2754; GFX10-CU-NEXT:    s_endpgm
2755;
2756; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg:
2757; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2758; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2759; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2760; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2761; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2762; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2764; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2767; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2768; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2769; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2770; SKIP-CACHE-INV-NEXT:    s_endpgm
2771;
2772; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
2773; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2774; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2775; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2776; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2777; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2778; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2779; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2780; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2781; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2782; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2783; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2784; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2785; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2786;
2787; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
2788; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2789; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2790; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2791; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2792; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2793; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2794; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2795; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2796; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2797; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2798; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2799; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2800; GFX90A-TGSPLIT-NEXT:    s_endpgm
2801    i32* %out, i32 %in, i32 %old) {
2802entry:
; Compare against %old and swap in %in at element 4 of %out; the result is discarded.
2803  %gep = getelementptr i32, i32* %out, i32 4
2804  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst
2805  ret void
2806}
2807
; Non-returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using success ordering `acq_rel` and failure ordering `seq_cst`;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; %val is never used, and every prefix expects flat_atomic_cmpswap without a
; destination register and without `glc`.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2808define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
2809; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2810; GFX7:       ; %bb.0: ; %entry
2811; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2812; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2813; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2814; GFX7-NEXT:    s_add_u32 s0, s0, 16
2815; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2816; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2817; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2818; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2819; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2820; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2821; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2822; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2823; GFX7-NEXT:    buffer_wbinvl1_vol
2824; GFX7-NEXT:    s_endpgm
2825;
2826; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2827; GFX10-WGP:       ; %bb.0: ; %entry
2828; GFX10-WGP-NEXT:    s_clause 0x1
2829; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2830; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2831; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2832; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2833; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2834; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2835; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2836; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2837; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2838; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2839; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2840; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2841; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2842; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2843; GFX10-WGP-NEXT:    buffer_gl0_inv
2844; GFX10-WGP-NEXT:    buffer_gl1_inv
2845; GFX10-WGP-NEXT:    s_endpgm
2846;
2847; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2848; GFX10-CU:       ; %bb.0: ; %entry
2849; GFX10-CU-NEXT:    s_clause 0x1
2850; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2851; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2852; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2853; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2854; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2855; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2856; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2857; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2858; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2859; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2860; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2861; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2862; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2864; GFX10-CU-NEXT:    buffer_gl0_inv
2865; GFX10-CU-NEXT:    buffer_gl1_inv
2866; GFX10-CU-NEXT:    s_endpgm
2867;
2868; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2869; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2870; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2871; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2872; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2873; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2874; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2876; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2877; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2878; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2879; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2880; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2881; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2882; SKIP-CACHE-INV-NEXT:    s_endpgm
2883;
2884; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2885; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2886; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2887; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2888; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2889; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2890; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2891; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2892; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2893; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2894; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2895; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2896; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2897; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2898;
2899; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
2900; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2901; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2902; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2903; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2904; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
2905; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
2906; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2907; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2908; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2909; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2910; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2911; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2912; GFX90A-TGSPLIT-NEXT:    s_endpgm
2913    i32* %out, i32 %in, i32 %old) {
2914entry:
; Compare against %old and swap in %in at element 4 of %out; the result is discarded.
2915  %gep = getelementptr i32, i32* %out, i32 4
2916  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst
2917  ret void
2918}
2919
; Non-returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using `seq_cst` for both the success and the failure ordering;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; %val is never used, and every prefix expects flat_atomic_cmpswap without a
; destination register and without `glc`.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2920define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
2921; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
2922; GFX7:       ; %bb.0: ; %entry
2923; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2924; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2925; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2926; GFX7-NEXT:    s_add_u32 s0, s0, 16
2927; GFX7-NEXT:    s_addc_u32 s1, s1, 0
2928; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2929; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2930; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2931; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2932; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2933; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2934; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2935; GFX7-NEXT:    buffer_wbinvl1_vol
2936; GFX7-NEXT:    s_endpgm
2937;
2938; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
2939; GFX10-WGP:       ; %bb.0: ; %entry
2940; GFX10-WGP-NEXT:    s_clause 0x1
2941; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2942; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2943; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2944; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
2945; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
2946; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2947; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2948; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2949; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2950; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2951; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2952; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2953; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2954; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2955; GFX10-WGP-NEXT:    buffer_gl0_inv
2956; GFX10-WGP-NEXT:    buffer_gl1_inv
2957; GFX10-WGP-NEXT:    s_endpgm
2958;
2959; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
2960; GFX10-CU:       ; %bb.0: ; %entry
2961; GFX10-CU-NEXT:    s_clause 0x1
2962; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2963; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2964; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2965; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
2966; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
2967; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2968; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2969; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2970; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2971; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2972; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2973; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2974; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2975; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2976; GFX10-CU-NEXT:    buffer_gl0_inv
2977; GFX10-CU-NEXT:    buffer_gl1_inv
2978; GFX10-CU-NEXT:    s_endpgm
2979;
2980; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
2981; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2982; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2983; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2984; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2985; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
2986; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
2987; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2988; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2989; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2991; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2992; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2993; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2994; SKIP-CACHE-INV-NEXT:    s_endpgm
2995;
2996; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
2997; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2998; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2999; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3001; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3002; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3003; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3004; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3005; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3006; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3007; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3008; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3009; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3010;
3011; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
3012; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3013; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3014; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3015; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3016; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3017; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3018; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3019; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3020; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3021; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3022; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3023; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3024; GFX90A-TGSPLIT-NEXT:    s_endpgm
3025    i32* %out, i32 %in, i32 %old) {
3026entry:
; Compare against %old and swap in %in at element 4 of %out; the result is discarded.
3027  %gep = getelementptr i32, i32* %out, i32 4
3028  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
3029  ret void
3030}
3031
; Returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using `monotonic` for both the success and the failure ordering;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; Field 0 of the cmpxchg result is stored back to %out, and every prefix
; expects the `glc` form of flat_atomic_cmpswap with destination v2 followed by
; a flat_store_dword of v2. No prefix expects any buffer_* cache instruction,
; and the only wait between the atomic and the store is the one guarding v2.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3032define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
3033; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3034; GFX7:       ; %bb.0: ; %entry
3035; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3036; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3037; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3038; GFX7-NEXT:    s_add_u32 s4, s0, 16
3039; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3040; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3041; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3042; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3043; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3044; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3045; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3046; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3047; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3048; GFX7-NEXT:    flat_store_dword v[0:1], v2
3049; GFX7-NEXT:    s_endpgm
3050;
3051; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3052; GFX10-WGP:       ; %bb.0: ; %entry
3053; GFX10-WGP-NEXT:    s_clause 0x1
3054; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3055; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3056; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3057; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3058; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3059; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3060; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3061; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3062; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3063; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3064; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3065; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3066; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3067; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3068; GFX10-WGP-NEXT:    s_endpgm
3069;
3070; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3071; GFX10-CU:       ; %bb.0: ; %entry
3072; GFX10-CU-NEXT:    s_clause 0x1
3073; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3074; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3075; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3076; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3077; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3079; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3080; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3081; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3082; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3083; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3084; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3085; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3086; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3087; GFX10-CU-NEXT:    s_endpgm
3088;
3089; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3090; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3091; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3092; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3093; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3094; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3095; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3097; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3099; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3100; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3101; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3102; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3103; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3104; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3105; SKIP-CACHE-INV-NEXT:    s_endpgm
3106;
3107; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3108; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3109; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3110; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3111; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3112; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3113; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3114; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3115; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3116; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3117; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3118;
3119; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
3120; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3121; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3122; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3123; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3124; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3125; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3126; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3127; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3128; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3129; GFX90A-TGSPLIT-NEXT:    s_endpgm
3130    i32* %out, i32 %in, i32 %old) {
3131entry:
; Compare against %old and swap in %in at element 4 of %out, then store field 0
; of the { i32, i1 } result to %out so the returning form of the atomic is used.
3132  %gep = getelementptr i32, i32* %out, i32 4
3133  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic
3134  %val0 = extractvalue { i32, i1 } %val, 0
3135  store i32 %val0, i32* %out, align 4
3136  ret void
3137}
3138
; Returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using success ordering `acquire` and failure ordering `monotonic`;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; Field 0 of the cmpxchg result is stored back to %out, and every prefix
; expects the `glc` form of flat_atomic_cmpswap with destination v2.
; The expected code places an s_waitcnt directly after the atomic; on every
; prefix except SKIP-CACHE-INV (run with -amdgcn-skip-cache-invalidations) that
; wait is followed by buffer_* cache instructions before the flat_store_dword.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3139define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
3140; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3141; GFX7:       ; %bb.0: ; %entry
3142; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3143; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3144; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3145; GFX7-NEXT:    s_add_u32 s4, s0, 16
3146; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3147; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3148; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3149; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3150; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3151; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3152; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3153; GFX7-NEXT:    buffer_wbinvl1_vol
3154; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3155; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3156; GFX7-NEXT:    flat_store_dword v[0:1], v2
3157; GFX7-NEXT:    s_endpgm
3158;
3159; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3160; GFX10-WGP:       ; %bb.0: ; %entry
3161; GFX10-WGP-NEXT:    s_clause 0x1
3162; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3163; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3164; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3165; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3166; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3167; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3168; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3169; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3170; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3171; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3172; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3173; GFX10-WGP-NEXT:    buffer_gl0_inv
3174; GFX10-WGP-NEXT:    buffer_gl1_inv
3175; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3176; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3177; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3178; GFX10-WGP-NEXT:    s_endpgm
3179;
3180; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3181; GFX10-CU:       ; %bb.0: ; %entry
3182; GFX10-CU-NEXT:    s_clause 0x1
3183; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3184; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3185; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3186; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3187; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3188; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3189; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3190; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3191; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3192; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3193; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3194; GFX10-CU-NEXT:    buffer_gl0_inv
3195; GFX10-CU-NEXT:    buffer_gl1_inv
3196; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3197; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3198; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3199; GFX10-CU-NEXT:    s_endpgm
3200;
3201; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3202; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3203; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3204; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3205; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3206; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3207; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3210; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3212; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3213; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3216; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3217; SKIP-CACHE-INV-NEXT:    s_endpgm
3218;
3219; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3220; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3221; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3222; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3223; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3224; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3225; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3226; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3227; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3228; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3229; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3230; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3231; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3232;
3233; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
3234; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3235; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3236; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3237; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3238; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3239; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3240; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3241; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3242; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3243; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3244; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3245; GFX90A-TGSPLIT-NEXT:    s_endpgm
3246    i32* %out, i32 %in, i32 %old) {
3247entry:
; Compare against %old and swap in %in at element 4 of %out, then store field 0
; of the { i32, i1 } result to %out so the returning form of the atomic is used.
3248  %gep = getelementptr i32, i32* %out, i32 4
3249  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic
3250  %val0 = extractvalue { i32, i1 } %val, 0
3251  store i32 %val0, i32* %out, align 4
3252  ret void
3253}
3254
; Returning volatile cmpxchg through an i32 pointer with no address-space
; qualifier, using success ordering `release` and failure ordering `monotonic`;
; no syncscope is written on the instruction.
; The accessed address is %out + 4 i32 elements, which appears in the checks as
; `s_add_u32 ..., 16` or as `offset:16` on the atomic.
; Field 0 of the cmpxchg result is stored back to %out, and every prefix
; expects the `glc` form of flat_atomic_cmpswap with destination v2.
; The expected code places an s_waitcnt directly before the atomic (plus
; s_waitcnt_vscnt on the GFX10 prefixes, and preceded by buffer_wbl2 on the
; GFX90A prefixes). No buffer_* instruction is expected between the atomic and
; the flat_store_dword on any prefix.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3255define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
3256; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3257; GFX7:       ; %bb.0: ; %entry
3258; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3259; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3260; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3261; GFX7-NEXT:    s_add_u32 s4, s0, 16
3262; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3263; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3264; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3265; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3266; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3267; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3268; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3269; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3270; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3271; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3272; GFX7-NEXT:    flat_store_dword v[0:1], v2
3273; GFX7-NEXT:    s_endpgm
3274;
3275; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3276; GFX10-WGP:       ; %bb.0: ; %entry
3277; GFX10-WGP-NEXT:    s_clause 0x1
3278; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3279; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3280; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3281; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3282; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3283; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3284; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3285; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3286; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3287; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3288; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3289; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3290; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3291; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3292; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3293; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3294; GFX10-WGP-NEXT:    s_endpgm
3295;
3296; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3297; GFX10-CU:       ; %bb.0: ; %entry
3298; GFX10-CU-NEXT:    s_clause 0x1
3299; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3300; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3301; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3302; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3303; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3304; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3305; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3306; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3307; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3308; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3309; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3310; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3311; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3312; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3313; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3314; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3315; GFX10-CU-NEXT:    s_endpgm
3316;
3317; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3318; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3319; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3321; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3322; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3323; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3328; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3329; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3332; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3333; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3334; SKIP-CACHE-INV-NEXT:    s_endpgm
3335;
3336; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3340; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3341; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3342; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3343; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3344; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3345; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3346; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3347; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3348; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3349;
3350; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
3351; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3352; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3354; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3355; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3356; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3357; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3358; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3359; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3360; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3361; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3362; GFX90A-TGSPLIT-NEXT:    s_endpgm
3363    i32* %out, i32 %in, i32 %old) {
3364entry:
; Compare against %old and swap in %in at element 4 of %out, then store field 0
; of the { i32, i1 } result to %out so the returning form of the atomic is used.
3365  %gep = getelementptr i32, i32* %out, i32 4
3366  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic
3367  %val0 = extractvalue { i32, i1 } %val, 0
3368  store i32 %val0, i32* %out, align 4
3369  ret void
3370}
3371
; Kernel under test: a volatile cmpxchg whose result is used, on an unqualified
; i32* pointer, with success ordering acq_rel and failure ordering monotonic.
; No syncscope qualifier is written on the instruction.
;
; The assertions inside this definition are autogenerated (see the note on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. Each check prefix belongs to one of the llc
; invocations at the top of the file.
;
; What the expected output shows for this ordering pair:
;   - before the atomic, every configuration waits on vmcnt(0) lgkmcnt(0); the
;     gfx1010 outputs add s_waitcnt_vscnt and the gfx90a outputs add
;     buffer_wbl2;
;   - after the atomic there is a wait followed by cache invalidation
;     (buffer_wbinvl1_vol, buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 +
;     buffer_wbinvl1_vol), except in the output produced with
;     -amdgcn-skip-cache-invalidations, which has the wait but no invalidate;
;   - with +tgsplit the wait after the atomic is vmcnt(0) only.
3372define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
3373; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3374; GFX7:       ; %bb.0: ; %entry
3375; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3376; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3377; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3378; GFX7-NEXT:    s_add_u32 s4, s0, 16
3379; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3380; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3381; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3382; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3383; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3384; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3385; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3386; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3387; GFX7-NEXT:    buffer_wbinvl1_vol
3388; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3389; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3390; GFX7-NEXT:    flat_store_dword v[0:1], v2
3391; GFX7-NEXT:    s_endpgm
3392;
3393; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3394; GFX10-WGP:       ; %bb.0: ; %entry
3395; GFX10-WGP-NEXT:    s_clause 0x1
3396; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3397; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3398; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3399; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3400; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3401; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3402; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3403; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3404; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3405; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3406; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3407; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3408; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3409; GFX10-WGP-NEXT:    buffer_gl0_inv
3410; GFX10-WGP-NEXT:    buffer_gl1_inv
3411; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3412; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3413; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3414; GFX10-WGP-NEXT:    s_endpgm
3415;
3416; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3417; GFX10-CU:       ; %bb.0: ; %entry
3418; GFX10-CU-NEXT:    s_clause 0x1
3419; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3420; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3421; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3422; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3423; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3424; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3425; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3426; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3427; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3428; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3429; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3430; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3431; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3432; GFX10-CU-NEXT:    buffer_gl0_inv
3433; GFX10-CU-NEXT:    buffer_gl1_inv
3434; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3435; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3436; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3437; GFX10-CU-NEXT:    s_endpgm
3438;
3439; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3440; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3441; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3442; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3443; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3444; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3445; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3446; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3450; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3451; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3452; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3454; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3455; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3456; SKIP-CACHE-INV-NEXT:    s_endpgm
3457;
3458; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3459; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3460; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3461; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3462; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3464; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3465; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3466; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3467; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3468; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3469; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3470; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3471; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3472; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3473;
3474; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
3475; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3476; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3477; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3478; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3479; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3480; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3481; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3482; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3483; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3484; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3485; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3486; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3487; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3488; GFX90A-TGSPLIT-NEXT:    s_endpgm
3489    i32* %out, i32 %in, i32 %old) {
3490entry:
  ; Address of the i32 four elements past %out; the expected output shows this
  ; as a 16-byte offset (s_add_u32 ..., 16 or offset:16).
3491  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on a match, write %in.
3492  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the i32 part (the value read from
  ; memory, per the LLVM LangRef); the i1 part is unused here.
3493  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps it live; the expected output uses the glc form of
  ; flat_atomic_cmpswap and stores v2.
3494  store i32 %val0, i32* %out, align 4
3495  ret void
3496}
3497
; Kernel under test: a volatile cmpxchg whose result is used, on an unqualified
; i32* pointer, with success ordering seq_cst and failure ordering monotonic.
; No syncscope qualifier is written on the instruction.
;
; The assertions inside this definition are autogenerated (see the note on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. Each check prefix belongs to one of the llc
; invocations at the top of the file.
;
; What the expected output shows for this ordering pair:
;   - before the atomic, every configuration waits on vmcnt(0) lgkmcnt(0); the
;     gfx1010 outputs add s_waitcnt_vscnt and the gfx90a outputs add
;     buffer_wbl2;
;   - after the atomic there is a wait followed by cache invalidation
;     (buffer_wbinvl1_vol, buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 +
;     buffer_wbinvl1_vol), except in the output produced with
;     -amdgcn-skip-cache-invalidations, which has the wait but no invalidate;
;   - with +tgsplit the wait after the atomic is vmcnt(0) only.
3498define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
3499; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3500; GFX7:       ; %bb.0: ; %entry
3501; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3502; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3503; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3504; GFX7-NEXT:    s_add_u32 s4, s0, 16
3505; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3506; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3507; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3508; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3509; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3510; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3511; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3512; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3513; GFX7-NEXT:    buffer_wbinvl1_vol
3514; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3515; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3516; GFX7-NEXT:    flat_store_dword v[0:1], v2
3517; GFX7-NEXT:    s_endpgm
3518;
3519; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3520; GFX10-WGP:       ; %bb.0: ; %entry
3521; GFX10-WGP-NEXT:    s_clause 0x1
3522; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3523; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3524; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3525; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3526; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3527; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3528; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3529; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3530; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3531; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3532; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3533; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3534; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3535; GFX10-WGP-NEXT:    buffer_gl0_inv
3536; GFX10-WGP-NEXT:    buffer_gl1_inv
3537; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3538; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3539; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3540; GFX10-WGP-NEXT:    s_endpgm
3541;
3542; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3543; GFX10-CU:       ; %bb.0: ; %entry
3544; GFX10-CU-NEXT:    s_clause 0x1
3545; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3546; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3547; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3548; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3549; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3550; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3551; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3552; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3553; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3554; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3555; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3556; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3557; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3558; GFX10-CU-NEXT:    buffer_gl0_inv
3559; GFX10-CU-NEXT:    buffer_gl1_inv
3560; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3561; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3562; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3563; GFX10-CU-NEXT:    s_endpgm
3564;
3565; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3566; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3567; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3568; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3569; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3570; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3571; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3572; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3573; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3574; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3575; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3576; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3577; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3578; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3579; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3580; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3581; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3582; SKIP-CACHE-INV-NEXT:    s_endpgm
3583;
3584; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3585; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3586; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3587; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3588; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3589; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3590; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3591; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3592; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3593; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3594; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3595; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3596; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3597; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3598; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3599;
3600; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
3601; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3602; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3603; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3604; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3605; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3606; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3607; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3608; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3609; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3610; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3611; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3612; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3613; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3614; GFX90A-TGSPLIT-NEXT:    s_endpgm
3615    i32* %out, i32 %in, i32 %old) {
3616entry:
  ; Address of the i32 four elements past %out; the expected output shows this
  ; as a 16-byte offset (s_add_u32 ..., 16 or offset:16).
3617  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on a match, write %in.
3618  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the i32 part (the value read from
  ; memory, per the LLVM LangRef); the i1 part is unused here.
3619  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps it live; the expected output uses the glc form of
  ; flat_atomic_cmpswap and stores v2.
3620  store i32 %val0, i32* %out, align 4
3621  ret void
3622}
3623
; Kernel under test: a volatile cmpxchg whose result is used, on an unqualified
; i32* pointer, with success ordering monotonic and failure ordering acquire.
; No syncscope qualifier is written on the instruction.
;
; The assertions inside this definition are autogenerated (see the note on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. Each check prefix belongs to one of the llc
; invocations at the top of the file.
;
; What the expected output shows for this ordering pair:
;   - the atomic directly follows the register setup: the only earlier wait is
;     the lgkmcnt(0) after the s_load instructions, and the gfx90a outputs
;     contain no buffer_wbl2;
;   - after the atomic there is a wait followed by cache invalidation
;     (buffer_wbinvl1_vol, buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 +
;     buffer_wbinvl1_vol), except in the output produced with
;     -amdgcn-skip-cache-invalidations, which has the wait but no invalidate;
;   - with +tgsplit the wait after the atomic is vmcnt(0) only.
3624define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
3625; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3626; GFX7:       ; %bb.0: ; %entry
3627; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3628; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3629; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX7-NEXT:    s_add_u32 s4, s0, 16
3631; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3632; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3633; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3634; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3635; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3636; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3637; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3638; GFX7-NEXT:    buffer_wbinvl1_vol
3639; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3640; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3641; GFX7-NEXT:    flat_store_dword v[0:1], v2
3642; GFX7-NEXT:    s_endpgm
3643;
3644; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3645; GFX10-WGP:       ; %bb.0: ; %entry
3646; GFX10-WGP-NEXT:    s_clause 0x1
3647; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3648; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3649; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3650; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3651; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3652; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3653; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3654; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3655; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3656; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3657; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3658; GFX10-WGP-NEXT:    buffer_gl0_inv
3659; GFX10-WGP-NEXT:    buffer_gl1_inv
3660; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3661; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3662; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3663; GFX10-WGP-NEXT:    s_endpgm
3664;
3665; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3666; GFX10-CU:       ; %bb.0: ; %entry
3667; GFX10-CU-NEXT:    s_clause 0x1
3668; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3669; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3670; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3671; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3672; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3673; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3674; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3675; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3676; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3677; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3678; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3679; GFX10-CU-NEXT:    buffer_gl0_inv
3680; GFX10-CU-NEXT:    buffer_gl1_inv
3681; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3682; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3683; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3684; GFX10-CU-NEXT:    s_endpgm
3685;
3686; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3687; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3688; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3689; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3690; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3691; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3692; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3693; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3697; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3698; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3699; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3700; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3701; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3702; SKIP-CACHE-INV-NEXT:    s_endpgm
3703;
3704; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3705; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3706; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3707; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3708; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3709; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3710; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3711; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3712; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3713; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3714; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3715; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3716; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3717;
3718; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
3719; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3720; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3721; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3722; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3723; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3724; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3725; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3726; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3727; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3728; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3729; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3730; GFX90A-TGSPLIT-NEXT:    s_endpgm
3731    i32* %out, i32 %in, i32 %old) {
3732entry:
  ; Address of the i32 four elements past %out; the expected output shows this
  ; as a 16-byte offset (s_add_u32 ..., 16 or offset:16).
3733  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on a match, write %in.
3734  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire
  ; Field 0 of the { i32, i1 } result is the i32 part (the value read from
  ; memory, per the LLVM LangRef); the i1 part is unused here.
3735  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps it live; the expected output uses the glc form of
  ; flat_atomic_cmpswap and stores v2.
3736  store i32 %val0, i32* %out, align 4
3737  ret void
3738}
3739
; Kernel under test: a volatile cmpxchg whose result is used, on an unqualified
; i32* pointer, with acquire as both the success and the failure ordering.
; No syncscope qualifier is written on the instruction.
;
; The assertions inside this definition are autogenerated (see the note on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. Each check prefix belongs to one of the llc
; invocations at the top of the file.
;
; What the expected output shows for this ordering pair:
;   - the atomic directly follows the register setup: the only earlier wait is
;     the lgkmcnt(0) after the s_load instructions, and the gfx90a outputs
;     contain no buffer_wbl2;
;   - after the atomic there is a wait followed by cache invalidation
;     (buffer_wbinvl1_vol, buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 +
;     buffer_wbinvl1_vol), except in the output produced with
;     -amdgcn-skip-cache-invalidations, which has the wait but no invalidate;
;   - with +tgsplit the wait after the atomic is vmcnt(0) only.
3740define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
3741; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3742; GFX7:       ; %bb.0: ; %entry
3743; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3744; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3745; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3746; GFX7-NEXT:    s_add_u32 s4, s0, 16
3747; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3748; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3749; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3750; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3751; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3752; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3753; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3754; GFX7-NEXT:    buffer_wbinvl1_vol
3755; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3756; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3757; GFX7-NEXT:    flat_store_dword v[0:1], v2
3758; GFX7-NEXT:    s_endpgm
3759;
3760; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3761; GFX10-WGP:       ; %bb.0: ; %entry
3762; GFX10-WGP-NEXT:    s_clause 0x1
3763; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3764; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3765; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3766; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3767; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3768; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3769; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3770; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3771; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3772; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3773; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3774; GFX10-WGP-NEXT:    buffer_gl0_inv
3775; GFX10-WGP-NEXT:    buffer_gl1_inv
3776; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3777; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3778; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3779; GFX10-WGP-NEXT:    s_endpgm
3780;
3781; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3782; GFX10-CU:       ; %bb.0: ; %entry
3783; GFX10-CU-NEXT:    s_clause 0x1
3784; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3785; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3786; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3787; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3788; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3789; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3790; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3791; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3792; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3793; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3794; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3795; GFX10-CU-NEXT:    buffer_gl0_inv
3796; GFX10-CU-NEXT:    buffer_gl1_inv
3797; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3798; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3799; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3800; GFX10-CU-NEXT:    s_endpgm
3801;
3802; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3803; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3804; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3805; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3806; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3807; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3808; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3809; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3810; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3811; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3813; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3814; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3815; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3816; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3817; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3818; SKIP-CACHE-INV-NEXT:    s_endpgm
3819;
3820; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3821; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3822; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3823; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3824; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3825; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3826; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3827; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3828; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3829; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3830; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3831; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3832; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3833;
3834; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
3835; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3836; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3837; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3838; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3839; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3840; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3841; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3842; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3843; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3844; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3845; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3846; GFX90A-TGSPLIT-NEXT:    s_endpgm
3847    i32* %out, i32 %in, i32 %old) {
3848entry:
  ; Address of the i32 four elements past %out; the expected output shows this
  ; as a 16-byte offset (s_add_u32 ..., 16 or offset:16).
3849  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on a match, write %in.
3850  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire
  ; Field 0 of the { i32, i1 } result is the i32 part (the value read from
  ; memory, per the LLVM LangRef); the i1 part is unused here.
3851  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps it live; the expected output uses the glc form of
  ; flat_atomic_cmpswap and stores v2.
3852  store i32 %val0, i32* %out, align 4
3853  ret void
3854}
3855
; Kernel under test: a volatile cmpxchg whose result is used, on an unqualified
; i32* pointer, with success ordering release and failure ordering acquire.
; No syncscope qualifier is written on the instruction.
;
; The assertions inside this definition are autogenerated (see the note on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. Each check prefix belongs to one of the llc
; invocations at the top of the file.
;
; What the expected output shows for this ordering pair:
;   - before the atomic, every configuration waits on vmcnt(0) lgkmcnt(0); the
;     gfx1010 outputs add s_waitcnt_vscnt and the gfx90a outputs add
;     buffer_wbl2;
;   - after the atomic there is a wait followed by cache invalidation
;     (buffer_wbinvl1_vol, buffer_gl0_inv + buffer_gl1_inv, or buffer_invl2 +
;     buffer_wbinvl1_vol), except in the output produced with
;     -amdgcn-skip-cache-invalidations, which has the wait but no invalidate;
;   - with +tgsplit the wait after the atomic is vmcnt(0) only.
3856define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
3857; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
3858; GFX7:       ; %bb.0: ; %entry
3859; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3860; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3861; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3862; GFX7-NEXT:    s_add_u32 s4, s0, 16
3863; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3864; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3865; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3866; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3867; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3868; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3869; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3870; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3871; GFX7-NEXT:    buffer_wbinvl1_vol
3872; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3873; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3874; GFX7-NEXT:    flat_store_dword v[0:1], v2
3875; GFX7-NEXT:    s_endpgm
3876;
3877; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
3878; GFX10-WGP:       ; %bb.0: ; %entry
3879; GFX10-WGP-NEXT:    s_clause 0x1
3880; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3881; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3882; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3883; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
3884; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
3885; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3886; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3887; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3888; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3889; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3890; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3891; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3892; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3893; GFX10-WGP-NEXT:    buffer_gl0_inv
3894; GFX10-WGP-NEXT:    buffer_gl1_inv
3895; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3896; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3897; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3898; GFX10-WGP-NEXT:    s_endpgm
3899;
3900; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
3901; GFX10-CU:       ; %bb.0: ; %entry
3902; GFX10-CU-NEXT:    s_clause 0x1
3903; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3904; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3905; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3906; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
3907; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
3908; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3909; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3910; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3911; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3912; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3913; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3914; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3915; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3916; GFX10-CU-NEXT:    buffer_gl0_inv
3917; GFX10-CU-NEXT:    buffer_gl1_inv
3918; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3919; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3920; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3921; GFX10-CU-NEXT:    s_endpgm
3922;
3923; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg:
3924; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3925; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3926; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3927; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3928; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
3929; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
3930; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3931; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3932; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
3933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3934; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3935; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3936; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3937; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3938; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3939; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3940; SKIP-CACHE-INV-NEXT:    s_endpgm
3941;
3942; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
3943; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3944; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3945; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3946; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3947; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3948; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3949; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3950; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3951; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3952; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3953; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3954; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3955; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3956; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3957;
3958; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
3959; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3960; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3961; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3962; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3963; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
3964; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
3965; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3966; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3967; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3968; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3969; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3970; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3971; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
3972; GFX90A-TGSPLIT-NEXT:    s_endpgm
3973    i32* %out, i32 %in, i32 %old) {
3974entry:
  ; Address of the i32 four elements past %out; the expected output shows this
  ; as a 16-byte offset (s_add_u32 ..., 16 or offset:16).
3975  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and, on a match, write %in.
3976  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire
  ; Field 0 of the { i32, i1 } result is the i32 part (the value read from
  ; memory, per the LLVM LangRef); the i1 part is unused here.
3977  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps it live; the expected output uses the glc form of
  ; flat_atomic_cmpswap and stores v2.
3978  store i32 %val0, i32* %out, align 4
3979  ret void
3980}
3981
3982define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
3983; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
3984; GFX7:       ; %bb.0: ; %entry
3985; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3986; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3987; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3988; GFX7-NEXT:    s_add_u32 s4, s0, 16
3989; GFX7-NEXT:    s_addc_u32 s5, s1, 0
3990; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3991; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3992; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3993; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3994; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3995; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3996; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3997; GFX7-NEXT:    buffer_wbinvl1_vol
3998; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3999; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4000; GFX7-NEXT:    flat_store_dword v[0:1], v2
4001; GFX7-NEXT:    s_endpgm
4002;
4003; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
4004; GFX10-WGP:       ; %bb.0: ; %entry
4005; GFX10-WGP-NEXT:    s_clause 0x1
4006; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4007; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4008; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4009; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4010; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4011; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4012; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4013; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4014; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4015; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4016; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4017; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4018; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4019; GFX10-WGP-NEXT:    buffer_gl0_inv
4020; GFX10-WGP-NEXT:    buffer_gl1_inv
4021; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4022; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4023; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4024; GFX10-WGP-NEXT:    s_endpgm
4025;
4026; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
4027; GFX10-CU:       ; %bb.0: ; %entry
4028; GFX10-CU-NEXT:    s_clause 0x1
4029; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4030; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4031; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4032; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4033; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4034; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4035; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4036; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4037; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4038; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4039; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4040; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4041; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4042; GFX10-CU-NEXT:    buffer_gl0_inv
4043; GFX10-CU-NEXT:    buffer_gl1_inv
4044; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4045; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4046; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4047; GFX10-CU-NEXT:    s_endpgm
4048;
4049; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
4050; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4051; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4052; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4053; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4054; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4055; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4056; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4058; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4059; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4060; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4061; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4062; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4063; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4064; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4065; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4066; SKIP-CACHE-INV-NEXT:    s_endpgm
4067;
4068; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
4069; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4070; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4071; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4072; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4073; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4074; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4075; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4076; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4077; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4078; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4079; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4080; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4081; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4082; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4083;
4084; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
4085; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4086; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4087; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4088; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4089; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4090; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4091; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4092; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4093; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4094; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4095; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4096; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4097; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4098; GFX90A-TGSPLIT-NEXT:    s_endpgm
4099    i32* %out, i32 %in, i32 %old) {
4100entry:
4101  %gep = getelementptr i32, i32* %out, i32 4
4102  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire
4103  %val0 = extractvalue { i32, i1 } %val, 0
4104  store i32 %val0, i32* %out, align 4
4105  ret void
4106}
4107
; Returning flat cmpxchg with no syncscope (the "system" of the test name):
; success ordering seq_cst, failure ordering acquire.
; The kernel compares the i32 located 16 bytes past %out with %old, swaps in
; %in on a match, and writes the value that was read back to %out itself.
; For every run line the expected output places waits on both sides of the
; returning (glc) flat_atomic_cmpswap; every run except the one that skips
; cache invalidations also expects invalidate instructions after it.
4108define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
4109; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4110; GFX7:       ; %bb.0: ; %entry
4111; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4112; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4114; GFX7-NEXT:    s_add_u32 s4, s0, 16
4115; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4116; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4117; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4118; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4119; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4120; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4121; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4122; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4123; GFX7-NEXT:    buffer_wbinvl1_vol
4124; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4125; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4126; GFX7-NEXT:    flat_store_dword v[0:1], v2
4127; GFX7-NEXT:    s_endpgm
4128;
4129; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4130; GFX10-WGP:       ; %bb.0: ; %entry
4131; GFX10-WGP-NEXT:    s_clause 0x1
4132; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4133; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4134; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4135; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4136; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4137; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4138; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4139; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4140; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4141; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4142; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4143; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4144; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4145; GFX10-WGP-NEXT:    buffer_gl0_inv
4146; GFX10-WGP-NEXT:    buffer_gl1_inv
4147; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4148; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4149; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4150; GFX10-WGP-NEXT:    s_endpgm
4151;
4152; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4153; GFX10-CU:       ; %bb.0: ; %entry
4154; GFX10-CU-NEXT:    s_clause 0x1
4155; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4156; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4157; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4158; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4159; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4160; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4161; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4163; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4164; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4165; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4166; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4167; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4168; GFX10-CU-NEXT:    buffer_gl0_inv
4169; GFX10-CU-NEXT:    buffer_gl1_inv
4170; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4171; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4172; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4173; GFX10-CU-NEXT:    s_endpgm
4174;
4175; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4176; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4177; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4178; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4179; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4180; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4181; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4182; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4184; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4185; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4186; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4187; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4188; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4191; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4192; SKIP-CACHE-INV-NEXT:    s_endpgm
4193;
4194; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4195; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4196; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4198; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4199; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4200; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4201; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4203; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4204; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4205; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4206; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4207; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4208; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4209;
4210; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
4211; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4212; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4213; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4214; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4215; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4216; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4217; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4218; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4219; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4220; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4221; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4222; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4223; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4224; GFX90A-TGSPLIT-NEXT:    s_endpgm
4225    i32* %out, i32 %in, i32 %old) {
4226entry:
  ; Address of the word operated on: 4 i32 elements (16 bytes) past %out.
4227  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; orderings are success=seq_cst, failure=acquire.
4228  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire
  ; Field 0 is the value read from memory; the i1 success flag (field 1) is not used.
4229  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
4230  store i32 %val0, i32* %out, align 4
4231  ret void
4232}
4233
; Returning flat cmpxchg with no syncscope (the "system" of the test name):
; success ordering monotonic, failure ordering seq_cst.
; The kernel compares the i32 located 16 bytes past %out with %old, swaps in
; %in on a match, and writes the value that was read back to %out itself.
; Although the success ordering is only monotonic, the expected output still
; has waits on both sides of flat_atomic_cmpswap and (except in the run that
; skips cache invalidations) invalidate instructions after it.
; NOTE(review): presumably the stronger failure ordering is merged into the
; ordering used for code generation - confirm against the AMDGPU memory
; legalizer before relying on this.
4234define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
4235; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4236; GFX7:       ; %bb.0: ; %entry
4237; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4238; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4239; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4240; GFX7-NEXT:    s_add_u32 s4, s0, 16
4241; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4242; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4243; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4244; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4245; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4246; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4247; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4248; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4249; GFX7-NEXT:    buffer_wbinvl1_vol
4250; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4251; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4252; GFX7-NEXT:    flat_store_dword v[0:1], v2
4253; GFX7-NEXT:    s_endpgm
4254;
4255; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4256; GFX10-WGP:       ; %bb.0: ; %entry
4257; GFX10-WGP-NEXT:    s_clause 0x1
4258; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4259; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4260; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4261; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4262; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4263; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4264; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4265; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4266; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4267; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4268; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4269; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4270; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4271; GFX10-WGP-NEXT:    buffer_gl0_inv
4272; GFX10-WGP-NEXT:    buffer_gl1_inv
4273; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4274; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4275; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4276; GFX10-WGP-NEXT:    s_endpgm
4277;
4278; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4279; GFX10-CU:       ; %bb.0: ; %entry
4280; GFX10-CU-NEXT:    s_clause 0x1
4281; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4282; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4283; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4284; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4285; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4286; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4287; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4288; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4289; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4290; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4291; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4292; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4293; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4294; GFX10-CU-NEXT:    buffer_gl0_inv
4295; GFX10-CU-NEXT:    buffer_gl1_inv
4296; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4297; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4298; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4299; GFX10-CU-NEXT:    s_endpgm
4300;
4301; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4302; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4303; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4304; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4305; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4306; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4307; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4308; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4310; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4311; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4312; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4313; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4314; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4316; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4317; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4318; SKIP-CACHE-INV-NEXT:    s_endpgm
4319;
4320; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4321; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4322; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4323; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4324; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4325; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4326; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4327; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4328; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4329; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4330; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4331; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4332; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4333; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4334; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4335;
4336; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
4337; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4338; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4339; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4340; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4341; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4342; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4343; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4344; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4345; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4346; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4347; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4348; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4349; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4350; GFX90A-TGSPLIT-NEXT:    s_endpgm
4351    i32* %out, i32 %in, i32 %old) {
4352entry:
  ; Address of the word operated on: 4 i32 elements (16 bytes) past %out.
4353  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; orderings are success=monotonic, failure=seq_cst.
4354  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst
  ; Field 0 is the value read from memory; the i1 success flag (field 1) is not used.
4355  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
4356  store i32 %val0, i32* %out, align 4
4357  ret void
4358}
4359
; Returning flat cmpxchg with no syncscope (the "system" of the test name):
; success ordering acquire, failure ordering seq_cst.
; The kernel compares the i32 located 16 bytes past %out with %old, swaps in
; %in on a match, and writes the value that was read back to %out itself.
; For every run line the expected output places waits on both sides of the
; returning (glc) flat_atomic_cmpswap; every run except the one that skips
; cache invalidations also expects invalidate instructions after it.
4360define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
4361; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4362; GFX7:       ; %bb.0: ; %entry
4363; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4364; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4365; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4366; GFX7-NEXT:    s_add_u32 s4, s0, 16
4367; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4368; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4369; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4370; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4371; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4372; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4373; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4374; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4375; GFX7-NEXT:    buffer_wbinvl1_vol
4376; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4377; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4378; GFX7-NEXT:    flat_store_dword v[0:1], v2
4379; GFX7-NEXT:    s_endpgm
4380;
4381; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4382; GFX10-WGP:       ; %bb.0: ; %entry
4383; GFX10-WGP-NEXT:    s_clause 0x1
4384; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4385; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4386; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4387; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4388; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4389; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4390; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4391; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4392; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4393; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4394; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4395; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4396; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4397; GFX10-WGP-NEXT:    buffer_gl0_inv
4398; GFX10-WGP-NEXT:    buffer_gl1_inv
4399; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4400; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4401; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4402; GFX10-WGP-NEXT:    s_endpgm
4403;
4404; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4405; GFX10-CU:       ; %bb.0: ; %entry
4406; GFX10-CU-NEXT:    s_clause 0x1
4407; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4408; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4409; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4410; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4411; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4412; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4413; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4414; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4415; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4416; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4417; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4418; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4419; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4420; GFX10-CU-NEXT:    buffer_gl0_inv
4421; GFX10-CU-NEXT:    buffer_gl1_inv
4422; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4423; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4424; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4425; GFX10-CU-NEXT:    s_endpgm
4426;
4427; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4428; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4429; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4431; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4432; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4433; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4434; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4437; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4438; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4439; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4440; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4443; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4444; SKIP-CACHE-INV-NEXT:    s_endpgm
4445;
4446; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4447; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4448; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4449; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4450; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4451; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4452; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4453; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4454; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4455; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4456; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4457; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4458; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4459; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4460; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4461;
4462; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
4463; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4464; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4465; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4466; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4468; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4469; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4470; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4471; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4472; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4473; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4474; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4475; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4476; GFX90A-TGSPLIT-NEXT:    s_endpgm
4477    i32* %out, i32 %in, i32 %old) {
4478entry:
  ; Address of the word operated on: 4 i32 elements (16 bytes) past %out.
4479  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; orderings are success=acquire, failure=seq_cst.
4480  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst
  ; Field 0 is the value read from memory; the i1 success flag (field 1) is not used.
4481  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
4482  store i32 %val0, i32* %out, align 4
4483  ret void
4484}
4485
; Returning flat cmpxchg with no syncscope (the "system" of the test name):
; success ordering release, failure ordering seq_cst.
; The kernel compares the i32 located 16 bytes past %out with %old, swaps in
; %in on a match, and writes the value that was read back to %out itself.
; The expected output has waits on both sides of flat_atomic_cmpswap and,
; except in the run that skips cache invalidations, invalidate instructions
; after it, even though the success ordering alone is release.
; NOTE(review): presumably the seq_cst failure ordering accounts for the
; trailing invalidates - confirm against the AMDGPU memory legalizer.
4486define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
4487; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4488; GFX7:       ; %bb.0: ; %entry
4489; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4490; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4491; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4492; GFX7-NEXT:    s_add_u32 s4, s0, 16
4493; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4494; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4495; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4496; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4497; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4498; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4499; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4500; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4501; GFX7-NEXT:    buffer_wbinvl1_vol
4502; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4503; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4504; GFX7-NEXT:    flat_store_dword v[0:1], v2
4505; GFX7-NEXT:    s_endpgm
4506;
4507; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4508; GFX10-WGP:       ; %bb.0: ; %entry
4509; GFX10-WGP-NEXT:    s_clause 0x1
4510; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4511; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4512; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4513; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4514; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4515; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4516; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4517; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4518; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4519; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4520; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4521; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4522; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4523; GFX10-WGP-NEXT:    buffer_gl0_inv
4524; GFX10-WGP-NEXT:    buffer_gl1_inv
4525; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4526; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4527; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4528; GFX10-WGP-NEXT:    s_endpgm
4529;
4530; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4531; GFX10-CU:       ; %bb.0: ; %entry
4532; GFX10-CU-NEXT:    s_clause 0x1
4533; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4534; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4535; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4536; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4537; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4538; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4539; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4540; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4541; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4542; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4543; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4544; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4545; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4546; GFX10-CU-NEXT:    buffer_gl0_inv
4547; GFX10-CU-NEXT:    buffer_gl1_inv
4548; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4549; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4550; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4551; GFX10-CU-NEXT:    s_endpgm
4552;
4553; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4554; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4555; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4556; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4557; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4558; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4559; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4560; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4561; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4562; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4563; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4564; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4565; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4566; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4569; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4570; SKIP-CACHE-INV-NEXT:    s_endpgm
4571;
4572; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4573; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4574; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4575; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4576; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4577; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4578; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4579; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4581; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4582; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4583; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4584; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4585; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4586; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4587;
4588; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
4589; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4590; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4591; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4592; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4593; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4594; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4595; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4596; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4597; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4598; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4599; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4600; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4601; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4602; GFX90A-TGSPLIT-NEXT:    s_endpgm
4603    i32* %out, i32 %in, i32 %old) {
4604entry:
  ; Address of the word operated on: 4 i32 elements (16 bytes) past %out.
4605  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; orderings are success=release, failure=seq_cst.
4606  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst
  ; Field 0 is the value read from memory; the i1 success flag (field 1) is not used.
4607  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
4608  store i32 %val0, i32* %out, align 4
4609  ret void
4610}
4611
; Returning flat cmpxchg with no syncscope (the "system" of the test name):
; success ordering acq_rel, failure ordering seq_cst.
; The kernel compares the i32 located 16 bytes past %out with %old, swaps in
; %in on a match, and writes the value that was read back to %out itself.
; For every run line the expected output places waits on both sides of the
; returning (glc) flat_atomic_cmpswap; every run except the one that skips
; cache invalidations also expects invalidate instructions after it.
4612define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
4613; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4614; GFX7:       ; %bb.0: ; %entry
4615; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4616; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4617; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4618; GFX7-NEXT:    s_add_u32 s4, s0, 16
4619; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4620; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4621; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4622; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4623; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4624; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4625; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4626; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4627; GFX7-NEXT:    buffer_wbinvl1_vol
4628; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4629; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4630; GFX7-NEXT:    flat_store_dword v[0:1], v2
4631; GFX7-NEXT:    s_endpgm
4632;
4633; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4634; GFX10-WGP:       ; %bb.0: ; %entry
4635; GFX10-WGP-NEXT:    s_clause 0x1
4636; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4637; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4638; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4639; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4640; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4641; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4642; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4643; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4644; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4645; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4646; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4647; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4648; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4649; GFX10-WGP-NEXT:    buffer_gl0_inv
4650; GFX10-WGP-NEXT:    buffer_gl1_inv
4651; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4652; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4653; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4654; GFX10-WGP-NEXT:    s_endpgm
4655;
4656; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4657; GFX10-CU:       ; %bb.0: ; %entry
4658; GFX10-CU-NEXT:    s_clause 0x1
4659; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4660; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4661; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4662; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4663; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4664; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4665; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4666; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4667; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4668; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4669; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4670; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4671; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4672; GFX10-CU-NEXT:    buffer_gl0_inv
4673; GFX10-CU-NEXT:    buffer_gl1_inv
4674; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4675; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4676; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4677; GFX10-CU-NEXT:    s_endpgm
4678;
4679; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4680; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4681; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4682; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4683; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4684; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4685; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4686; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4688; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4690; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4691; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4692; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4693; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4695; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4696; SKIP-CACHE-INV-NEXT:    s_endpgm
4697;
4698; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4699; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4700; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4701; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4702; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4703; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4704; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4705; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4706; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4707; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4708; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4709; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4710; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4711; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4712; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4713;
4714; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
4715; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4716; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4717; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4718; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4719; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4720; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4721; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4722; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4723; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4724; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4725; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4726; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4727; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4728; GFX90A-TGSPLIT-NEXT:    s_endpgm
4729    i32* %out, i32 %in, i32 %old) {
4730entry:
  ; Address of the word operated on: 4 i32 elements (16 bytes) past %out.
4731  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the comparand and %in the replacement; orderings are success=acq_rel, failure=seq_cst.
4732  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst
  ; Field 0 is the value read from memory; the i1 success flag (field 1) is not used.
4733  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic live.
4734  store i32 %val0, i32* %out, align 4
4735  ret void
4736}
4737
; Kernel under test: a volatile cmpxchg on a plain i32* with seq_cst success
; and seq_cst failure ordering and no explicit syncscope. The loaded value
; (field 0 of the cmpxchg result) is written back through %out.
;
; The expectations below come from the run lines at the top of the file and
; differ mainly in the cache maintenance around flat_atomic_cmpswap:
;   - gfx700: buffer_wbinvl1_vol after the atomic.
;   - gfx1010 (both runs): s_waitcnt_vscnt before, buffer_gl0_inv and
;     buffer_gl1_inv after.
;   - run with -amdgcn-skip-cache-invalidations: no invalidate is expected.
;   - gfx90a (both runs): buffer_wbl2 before, buffer_invl2 and
;     buffer_wbinvl1_vol after; the tgsplit run waits on vmcnt(0) only.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
4738define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
4739; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4740; GFX7:       ; %bb.0: ; %entry
4741; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4742; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4743; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4744; GFX7-NEXT:    s_add_u32 s4, s0, 16
4745; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4746; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4747; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4748; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4749; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4750; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4751; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4752; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4753; GFX7-NEXT:    buffer_wbinvl1_vol
4754; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4755; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4756; GFX7-NEXT:    flat_store_dword v[0:1], v2
4757; GFX7-NEXT:    s_endpgm
4758;
4759; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4760; GFX10-WGP:       ; %bb.0: ; %entry
4761; GFX10-WGP-NEXT:    s_clause 0x1
4762; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4763; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4764; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4765; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4766; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4767; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4768; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4769; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4770; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4771; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4772; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4773; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4774; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4775; GFX10-WGP-NEXT:    buffer_gl0_inv
4776; GFX10-WGP-NEXT:    buffer_gl1_inv
4777; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4778; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4779; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4780; GFX10-WGP-NEXT:    s_endpgm
4781;
4782; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4783; GFX10-CU:       ; %bb.0: ; %entry
4784; GFX10-CU-NEXT:    s_clause 0x1
4785; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4786; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4787; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4788; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4789; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4790; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4791; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4792; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4793; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4794; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4795; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4796; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4797; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4798; GFX10-CU-NEXT:    buffer_gl0_inv
4799; GFX10-CU-NEXT:    buffer_gl1_inv
4800; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4801; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4802; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4803; GFX10-CU-NEXT:    s_endpgm
4804;
4805; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4806; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4807; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4808; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4809; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4810; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4811; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4813; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4814; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4815; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4816; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4817; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4818; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4819; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4820; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4821; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4822; SKIP-CACHE-INV-NEXT:    s_endpgm
4823;
4824; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4825; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4826; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4827; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4828; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4829; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4830; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4831; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4832; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4833; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4834; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4835; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4836; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4837; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4838; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4839;
4840; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
4841; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4842; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4843; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4844; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4845; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
4846; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
4847; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4848; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4849; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4850; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4851; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4852; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4853; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
4854; GFX90A-TGSPLIT-NEXT:    s_endpgm
4855    i32* %out, i32 %in, i32 %old) {
4856entry:
  ; Element index 4 of an i32 pointer, i.e. the 16-byte offset that shows up
  ; as "s_add_u32 ..., 16" or "offset:16" in the expectations above.
4857  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old, swap in %in; both orderings are seq_cst.
4858  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
  ; Field 0 of the result pair is the value that was loaded from memory.
4859  %val0 = extractvalue { i32, i1 } %val, 0
4860  store i32 %val0, i32* %out, align 4
4861  ret void
4862}
4863
; Kernel under test: an atomic i32 load from %in with syncscope("one-as") and
; unordered ordering; the loaded value is then stored (non-atomically) to %out.
;
; In every run configuration the expected load is a plain flat_load_dword with
; no cache-policy modifier, and no cache invalidation follows it. The only
; difference between the two gfx90a runs is the wait before the store: the
; tgsplit run waits on vmcnt(0) alone, the other on vmcnt(0) and lgkmcnt(0).
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
4864define amdgpu_kernel void @flat_system_one_as_unordered_load(
4865; GFX7-LABEL: flat_system_one_as_unordered_load:
4866; GFX7:       ; %bb.0: ; %entry
4867; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4868; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4869; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4870; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4871; GFX7-NEXT:    flat_load_dword v0, v[0:1]
4872; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4873; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4874; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4875; GFX7-NEXT:    flat_store_dword v[2:3], v0
4876; GFX7-NEXT:    s_endpgm
4877;
4878; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
4879; GFX10-WGP:       ; %bb.0: ; %entry
4880; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4881; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4882; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4883; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4884; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
4885; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4886; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4887; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4888; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4889; GFX10-WGP-NEXT:    s_endpgm
4890;
4891; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
4892; GFX10-CU:       ; %bb.0: ; %entry
4893; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4894; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4895; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4896; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4897; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
4898; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4899; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4900; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4901; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4902; GFX10-CU-NEXT:    s_endpgm
4903;
4904; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load:
4905; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4906; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4907; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4908; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4909; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4910; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
4911; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4913; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4914; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
4915; SKIP-CACHE-INV-NEXT:    s_endpgm
4916;
4917; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
4918; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4919; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4920; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4921; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4922; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4923; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4924; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4925; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4926; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4927; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4928; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4929;
4930; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
4931; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4932; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4933; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4934; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
4935; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
4936; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
4937; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
4938; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
4939; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4940; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
4941; GFX90A-TGSPLIT-NEXT:    s_endpgm
4942    i32* %in, i32* %out) {
4943entry:
  ; The atomic access being tested; the store below only consumes its result.
4944  %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
4945  store i32 %val, i32* %out
4946  ret void
4947}
4948
; Kernel under test: an atomic i32 load from %in with syncscope("one-as") and
; monotonic ordering; the loaded value is then stored (non-atomically) to %out.
;
; The expected flat_load_dword carries the glc modifier in every run
; configuration, plus dlc in the two gfx1010 runs. No cache invalidation is
; expected after the load. The gfx90a tgsplit run waits on vmcnt(0) alone
; before the store; the other runs wait on vmcnt(0) and lgkmcnt(0).
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
4949define amdgpu_kernel void @flat_system_one_as_monotonic_load(
4950; GFX7-LABEL: flat_system_one_as_monotonic_load:
4951; GFX7:       ; %bb.0: ; %entry
4952; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4953; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4954; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4955; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4956; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
4957; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4958; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4959; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4960; GFX7-NEXT:    flat_store_dword v[2:3], v0
4961; GFX7-NEXT:    s_endpgm
4962;
4963; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
4964; GFX10-WGP:       ; %bb.0: ; %entry
4965; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4966; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4967; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4968; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4969; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
4970; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
4971; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
4972; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4973; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4974; GFX10-WGP-NEXT:    s_endpgm
4975;
4976; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
4977; GFX10-CU:       ; %bb.0: ; %entry
4978; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4979; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4980; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4981; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4982; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
4983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
4984; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
4985; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4986; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4987; GFX10-CU-NEXT:    s_endpgm
4988;
4989; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load:
4990; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4991; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4992; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4993; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4994; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4995; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
4996; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
4997; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
4998; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4999; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
5000; SKIP-CACHE-INV-NEXT:    s_endpgm
5001;
5002; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
5003; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5004; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5005; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5006; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5007; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5008; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5009; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5010; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5011; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5012; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5013; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5014;
5015; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
5016; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5017; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5018; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5019; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5020; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5021; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5022; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5023; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5024; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5025; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5026; GFX90A-TGSPLIT-NEXT:    s_endpgm
5027    i32* %in, i32* %out) {
5028entry:
  ; The atomic access being tested; the store below only consumes its result.
5029  %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
5030  store i32 %val, i32* %out
5031  ret void
5032}
5033
; Kernel under test: an atomic i32 load from %in with syncscope("one-as") and
; acquire ordering; the loaded value is then stored (non-atomically) to %out.
;
; The expected flat_load_dword carries glc (glc dlc in the gfx1010 runs) and
; is followed by "s_waitcnt vmcnt(0)" and then the cache invalidation:
;   - gfx700: buffer_wbinvl1_vol.
;   - gfx1010 (both runs): buffer_gl0_inv and buffer_gl1_inv.
;   - run with -amdgcn-skip-cache-invalidations: no invalidate is expected.
;   - gfx90a (both runs): buffer_invl2 and buffer_wbinvl1_vol.
; A separate "s_waitcnt lgkmcnt(0)" precedes the final store in every run
; except gfx90a with tgsplit, where no such wait is expected.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
5034define amdgpu_kernel void @flat_system_one_as_acquire_load(
5035; GFX7-LABEL: flat_system_one_as_acquire_load:
5036; GFX7:       ; %bb.0: ; %entry
5037; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5038; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5039; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5040; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5041; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
5042; GFX7-NEXT:    s_waitcnt vmcnt(0)
5043; GFX7-NEXT:    buffer_wbinvl1_vol
5044; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5045; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5046; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5047; GFX7-NEXT:    flat_store_dword v[2:3], v0
5048; GFX7-NEXT:    s_endpgm
5049;
5050; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
5051; GFX10-WGP:       ; %bb.0: ; %entry
5052; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5053; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5055; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5056; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
5057; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5058; GFX10-WGP-NEXT:    buffer_gl0_inv
5059; GFX10-WGP-NEXT:    buffer_gl1_inv
5060; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
5061; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
5062; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5063; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5064; GFX10-WGP-NEXT:    s_endpgm
5065;
5066; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
5067; GFX10-CU:       ; %bb.0: ; %entry
5068; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5069; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5070; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5071; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5072; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
5073; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5074; GFX10-CU-NEXT:    buffer_gl0_inv
5075; GFX10-CU-NEXT:    buffer_gl1_inv
5076; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
5077; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
5078; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5079; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5080; GFX10-CU-NEXT:    s_endpgm
5081;
5082; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load:
5083; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5084; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5085; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5086; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5088; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
5089; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
5092; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5093; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
5094; SKIP-CACHE-INV-NEXT:    s_endpgm
5095;
5096; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
5097; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5098; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5099; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5100; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5101; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5102; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5103; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5104; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5105; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5106; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5107; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5109; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5110; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5111;
5112; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
5113; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5114; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5115; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5116; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5117; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5118; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5119; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5120; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5121; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5122; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5123; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5124; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5125; GFX90A-TGSPLIT-NEXT:    s_endpgm
5126    i32* %in, i32* %out) {
5127entry:
  ; The atomic access being tested; the store below only consumes its result.
5128  %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
5129  store i32 %val, i32* %out
5130  ret void
5131}
5132
; Kernel under test: an atomic i32 load from %in with syncscope("one-as") and
; seq_cst ordering; the loaded value is then stored (non-atomically) to %out.
;
; Expected shape in every run configuration:
;   - a wait before the load: "s_waitcnt vmcnt(0)", plus
;     "s_waitcnt_vscnt null, 0x0" in the two gfx1010 runs;
;   - flat_load_dword with glc (glc dlc in the gfx1010 runs);
;   - "s_waitcnt vmcnt(0)" after the load, then the cache invalidation:
;     buffer_wbinvl1_vol on gfx700, buffer_gl0_inv and buffer_gl1_inv on
;     gfx1010, buffer_invl2 and buffer_wbinvl1_vol on gfx90a, and nothing in
;     the run with -amdgcn-skip-cache-invalidations.
; A separate "s_waitcnt lgkmcnt(0)" precedes the final store in every run
; except gfx90a with tgsplit, where no such wait is expected.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
5133define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
5134; GFX7-LABEL: flat_system_one_as_seq_cst_load:
5135; GFX7:       ; %bb.0: ; %entry
5136; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5138; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5139; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5140; GFX7-NEXT:    s_waitcnt vmcnt(0)
5141; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
5142; GFX7-NEXT:    s_waitcnt vmcnt(0)
5143; GFX7-NEXT:    buffer_wbinvl1_vol
5144; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5145; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5146; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5147; GFX7-NEXT:    flat_store_dword v[2:3], v0
5148; GFX7-NEXT:    s_endpgm
5149;
5150; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
5151; GFX10-WGP:       ; %bb.0: ; %entry
5152; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5153; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5154; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5155; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5156; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5157; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5158; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
5159; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5160; GFX10-WGP-NEXT:    buffer_gl0_inv
5161; GFX10-WGP-NEXT:    buffer_gl1_inv
5162; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
5163; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
5164; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5165; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5166; GFX10-WGP-NEXT:    s_endpgm
5167;
5168; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
5169; GFX10-CU:       ; %bb.0: ; %entry
5170; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5171; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5172; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5173; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5174; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5175; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5176; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
5177; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5178; GFX10-CU-NEXT:    buffer_gl0_inv
5179; GFX10-CU-NEXT:    buffer_gl1_inv
5180; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
5181; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
5182; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5183; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5184; GFX10-CU-NEXT:    s_endpgm
5185;
5186; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load:
5187; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5188; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5189; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5191; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5192; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5193; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
5194; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5195; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5196; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
5197; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5198; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
5199; SKIP-CACHE-INV-NEXT:    s_endpgm
5200;
5201; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
5202; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5203; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5204; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5205; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5206; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5207; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5208; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5209; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5210; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5211; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5212; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5213; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5214; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5215; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5216; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5217;
5218; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
5219; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5220; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
5221; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5222; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
5223; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
5224; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5225; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
5226; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5227; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5228; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5229; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5230; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
5231; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
5232; GFX90A-TGSPLIT-NEXT:    s_endpgm
5233    i32* %in, i32* %out) {
5234entry:
  ; The atomic access being tested; the store below only consumes its result.
5235  %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
5236  store i32 %val, i32* %out
5237  ret void
5238}
5239
; Kernel under test: an atomic i32 store of %in to %out with
; syncscope("one-as") and unordered ordering.
;
; In every run configuration the expected code is just the kernel-argument
; loads, one "s_waitcnt lgkmcnt(0)", the register moves, and a plain
; flat_store_dword: no additional waits and no cache maintenance instructions.
; The gfx90a runs build the address with a single v_pk_mov_b32 instead of two
; v_mov_b32_e32.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
5240define amdgpu_kernel void @flat_system_one_as_unordered_store(
5241; GFX7-LABEL: flat_system_one_as_unordered_store:
5242; GFX7:       ; %bb.0: ; %entry
5243; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
5244; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
5245; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5246; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5247; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5248; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5249; GFX7-NEXT:    flat_store_dword v[0:1], v2
5250; GFX7-NEXT:    s_endpgm
5251;
5252; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
5253; GFX10-WGP:       ; %bb.0: ; %entry
5254; GFX10-WGP-NEXT:    s_clause 0x1
5255; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5256; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
5257; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5258; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5259; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5260; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5261; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5262; GFX10-WGP-NEXT:    s_endpgm
5263;
5264; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
5265; GFX10-CU:       ; %bb.0: ; %entry
5266; GFX10-CU-NEXT:    s_clause 0x1
5267; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5268; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
5269; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5270; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5271; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5272; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5273; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5274; GFX10-CU-NEXT:    s_endpgm
5275;
5276; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store:
5277; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5278; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
5279; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5280; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5282; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5283; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5284; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5285; SKIP-CACHE-INV-NEXT:    s_endpgm
5286;
5287; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
5288; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5289; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5290; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5291; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5292; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5293; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5294; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5295; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5296;
5297; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
5298; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5299; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5300; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5301; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5302; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5303; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5304; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5305; GFX90A-TGSPLIT-NEXT:    s_endpgm
5306    i32 %in, i32* %out) {
5307entry:
  ; The atomic access being tested.
5308  store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
5309  ret void
5310}
5311
; Kernel under test: an atomic i32 store of %in to %out with
; syncscope("one-as") and monotonic ordering.
;
; In every run configuration the expected code is just the kernel-argument
; loads, one "s_waitcnt lgkmcnt(0)", the register moves, and a plain
; flat_store_dword: no additional waits and no cache maintenance instructions.
; The gfx90a runs build the address with a single v_pk_mov_b32 instead of two
; v_mov_b32_e32.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script instead of editing by hand.
5312define amdgpu_kernel void @flat_system_one_as_monotonic_store(
5313; GFX7-LABEL: flat_system_one_as_monotonic_store:
5314; GFX7:       ; %bb.0: ; %entry
5315; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
5316; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
5317; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5318; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5319; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5320; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5321; GFX7-NEXT:    flat_store_dword v[0:1], v2
5322; GFX7-NEXT:    s_endpgm
5323;
5324; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
5325; GFX10-WGP:       ; %bb.0: ; %entry
5326; GFX10-WGP-NEXT:    s_clause 0x1
5327; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5328; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
5329; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5330; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5331; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5332; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5333; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5334; GFX10-WGP-NEXT:    s_endpgm
5335;
5336; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
5337; GFX10-CU:       ; %bb.0: ; %entry
5338; GFX10-CU-NEXT:    s_clause 0x1
5339; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5340; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
5341; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5342; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5343; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5344; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5345; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5346; GFX10-CU-NEXT:    s_endpgm
5347;
5348; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store:
5349; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5350; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
5351; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5352; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5355; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5356; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5357; SKIP-CACHE-INV-NEXT:    s_endpgm
5358;
5359; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
5360; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5361; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5362; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5363; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5364; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5365; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5366; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5367; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5368;
5369; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
5370; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5371; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5372; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5373; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5374; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5375; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5376; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5377; GFX90A-TGSPLIT-NEXT:    s_endpgm
5378    i32 %in, i32* %out) {
5379entry:
  ; The atomic access being tested.
5380  store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
5381  ret void
5382}
5383
; Test: release atomic store of %in through the flat pointer %out with
; syncscope("one-as").
; Expected code in front of the flat_store_dword, per check prefix (prefixes
; are bound to targets by the llc invocations at the top of the file):
;   * gfx700, both the HSA run and the PAL skip-cache-invalidations run:
;     "s_waitcnt vmcnt(0)".
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)" followed by
;     "s_waitcnt_vscnt null, 0x0".
;   * gfx90a, with and without tgsplit: "buffer_wbl2" followed by
;     "s_waitcnt vmcnt(0)".
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5384define amdgpu_kernel void @flat_system_one_as_release_store(
5385; GFX7-LABEL: flat_system_one_as_release_store:
5386; GFX7:       ; %bb.0: ; %entry
5387; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
5388; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
5389; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5390; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5391; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5392; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5393; GFX7-NEXT:    s_waitcnt vmcnt(0)
5394; GFX7-NEXT:    flat_store_dword v[0:1], v2
5395; GFX7-NEXT:    s_endpgm
5396;
5397; GFX10-WGP-LABEL: flat_system_one_as_release_store:
5398; GFX10-WGP:       ; %bb.0: ; %entry
5399; GFX10-WGP-NEXT:    s_clause 0x1
5400; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5401; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
5402; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5403; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5404; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5405; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5406; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5407; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5408; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5409; GFX10-WGP-NEXT:    s_endpgm
5410;
5411; GFX10-CU-LABEL: flat_system_one_as_release_store:
5412; GFX10-CU:       ; %bb.0: ; %entry
5413; GFX10-CU-NEXT:    s_clause 0x1
5414; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5415; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
5416; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5418; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5419; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5420; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5421; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5422; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5423; GFX10-CU-NEXT:    s_endpgm
5424;
5425; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store:
5426; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5427; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
5428; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5429; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5432; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5433; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5434; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5435; SKIP-CACHE-INV-NEXT:    s_endpgm
5436;
5437; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
5438; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5439; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5440; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5441; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5442; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5443; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5444; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5445; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5446; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5447; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5448;
5449; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
5450; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5451; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5452; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5453; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5454; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5455; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5456; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5457; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5458; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5459; GFX90A-TGSPLIT-NEXT:    s_endpgm
5460    i32 %in, i32* %out) {
5461entry:
5462  store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
5463  ret void
5464}
5465
; Test: seq_cst atomic store of %in through the flat pointer %out with
; syncscope("one-as").
; Expected code in front of the flat_store_dword, per check prefix (prefixes
; are bound to targets by the llc invocations at the top of the file):
;   * gfx700, both the HSA run and the PAL skip-cache-invalidations run:
;     "s_waitcnt vmcnt(0)".
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)" followed by
;     "s_waitcnt_vscnt null, 0x0".
;   * gfx90a, with and without tgsplit: "buffer_wbl2" followed by
;     "s_waitcnt vmcnt(0)".
; Nothing is expected between the store and s_endpgm on any prefix.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5466define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
5467; GFX7-LABEL: flat_system_one_as_seq_cst_store:
5468; GFX7:       ; %bb.0: ; %entry
5469; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
5470; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
5471; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5472; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5473; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5474; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5475; GFX7-NEXT:    s_waitcnt vmcnt(0)
5476; GFX7-NEXT:    flat_store_dword v[0:1], v2
5477; GFX7-NEXT:    s_endpgm
5478;
5479; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
5480; GFX10-WGP:       ; %bb.0: ; %entry
5481; GFX10-WGP-NEXT:    s_clause 0x1
5482; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5483; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
5484; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5485; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5486; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5487; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5488; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5489; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5490; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5491; GFX10-WGP-NEXT:    s_endpgm
5492;
5493; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
5494; GFX10-CU:       ; %bb.0: ; %entry
5495; GFX10-CU-NEXT:    s_clause 0x1
5496; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5497; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
5498; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5499; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5500; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5501; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5502; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5503; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5504; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5505; GFX10-CU-NEXT:    s_endpgm
5506;
5507; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store:
5508; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5509; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
5510; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5511; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5512; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
5513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5515; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5516; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5517; SKIP-CACHE-INV-NEXT:    s_endpgm
5518;
5519; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
5520; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5521; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5522; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5523; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5524; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5525; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5526; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5527; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5528; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5529; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5530;
5531; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
5532; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5533; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
5534; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5535; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5536; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5537; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5538; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5539; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5540; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5541; GFX90A-TGSPLIT-NEXT:    s_endpgm
5542    i32 %in, i32* %out) {
5543entry:
5544  store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
5545  ret void
5546}
5547
; Test: monotonic volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"); the exchanged-out value %val is never used.
; On every check prefix the expected code is just the kernel-argument loads,
; the moves into VGPRs and a flat_atomic_swap written without a destination
; register or glc modifier, followed directly by s_endpgm. No vmcnt/vscnt
; wait, cache write-back or cache invalidate is expected around the atomic.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5548define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
5549; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
5550; GFX7:       ; %bb.0: ; %entry
5551; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5552; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5553; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5554; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5555; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5556; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5557; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5558; GFX7-NEXT:    s_endpgm
5559;
5560; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
5561; GFX10-WGP:       ; %bb.0: ; %entry
5562; GFX10-WGP-NEXT:    s_clause 0x1
5563; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5564; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5566; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5567; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5568; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5569; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5570; GFX10-WGP-NEXT:    s_endpgm
5571;
5572; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
5573; GFX10-CU:       ; %bb.0: ; %entry
5574; GFX10-CU-NEXT:    s_clause 0x1
5575; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5576; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5577; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5578; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5579; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5580; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5581; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5582; GFX10-CU-NEXT:    s_endpgm
5583;
5584; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw:
5585; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5586; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5587; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5588; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5592; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5593; SKIP-CACHE-INV-NEXT:    s_endpgm
5594;
5595; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
5596; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5597; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5598; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5599; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5600; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5601; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5602; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5603; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5604;
5605; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
5606; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5607; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5608; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5609; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5610; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5611; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5612; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5613; GFX90A-TGSPLIT-NEXT:    s_endpgm
5614    i32* %out, i32 %in) {
5615entry:
5616  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic
5617  ret void
5618}
5619
; Test: acquire volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"); the exchanged-out value %val is never used.
; Expected code after the flat_atomic_swap, per check prefix (prefixes are
; bound to targets by the llc invocations at the top of the file):
;   * gfx700 HSA run: "s_waitcnt vmcnt(0)" then "buffer_wbinvl1_vol".
;   * gfx1010, WGP and CU mode: "s_waitcnt_vscnt null, 0x0" then
;     "buffer_gl0_inv" and "buffer_gl1_inv".
;   * gfx700 PAL run with -amdgcn-skip-cache-invalidations: only
;     "s_waitcnt vmcnt(0)"; no cache invalidate is expected.
;   * gfx90a, with and without tgsplit: "s_waitcnt vmcnt(0)", "buffer_invl2",
;     "buffer_wbinvl1_vol".
; Nothing is expected between the VGPR moves and the atomic on any prefix.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5620define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
5621; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
5622; GFX7:       ; %bb.0: ; %entry
5623; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5624; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5625; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5626; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5627; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5628; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5629; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5630; GFX7-NEXT:    s_waitcnt vmcnt(0)
5631; GFX7-NEXT:    buffer_wbinvl1_vol
5632; GFX7-NEXT:    s_endpgm
5633;
5634; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
5635; GFX10-WGP:       ; %bb.0: ; %entry
5636; GFX10-WGP-NEXT:    s_clause 0x1
5637; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5638; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5639; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5640; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5641; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5642; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5643; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5644; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5645; GFX10-WGP-NEXT:    buffer_gl0_inv
5646; GFX10-WGP-NEXT:    buffer_gl1_inv
5647; GFX10-WGP-NEXT:    s_endpgm
5648;
5649; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
5650; GFX10-CU:       ; %bb.0: ; %entry
5651; GFX10-CU-NEXT:    s_clause 0x1
5652; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5653; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5654; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5655; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5656; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5657; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5658; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5659; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5660; GFX10-CU-NEXT:    buffer_gl0_inv
5661; GFX10-CU-NEXT:    buffer_gl1_inv
5662; GFX10-CU-NEXT:    s_endpgm
5663;
5664; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw:
5665; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5666; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5667; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5668; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5669; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5670; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5671; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5672; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5673; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5674; SKIP-CACHE-INV-NEXT:    s_endpgm
5675;
5676; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
5677; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5678; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5679; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5680; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5681; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5682; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5683; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5684; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5685; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5686; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5687; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5688;
5689; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
5690; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5691; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5692; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5693; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5694; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5695; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5696; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5697; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5698; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5699; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5700; GFX90A-TGSPLIT-NEXT:    s_endpgm
5701    i32* %out, i32 %in) {
5702entry:
5703  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
5704  ret void
5705}
5706
; Test: release volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"); the exchanged-out value %val is never used.
; Expected code in front of the flat_atomic_swap, per check prefix (prefixes
; are bound to targets by the llc invocations at the top of the file):
;   * gfx700, both the HSA run and the PAL skip-cache-invalidations run:
;     "s_waitcnt vmcnt(0)".
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)" followed by
;     "s_waitcnt_vscnt null, 0x0".
;   * gfx90a, with and without tgsplit: "buffer_wbl2" followed by
;     "s_waitcnt vmcnt(0)".
; Nothing is expected between the atomic and s_endpgm on any prefix.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5707define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
5708; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
5709; GFX7:       ; %bb.0: ; %entry
5710; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5711; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5712; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5713; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5714; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5715; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5716; GFX7-NEXT:    s_waitcnt vmcnt(0)
5717; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5718; GFX7-NEXT:    s_endpgm
5719;
5720; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
5721; GFX10-WGP:       ; %bb.0: ; %entry
5722; GFX10-WGP-NEXT:    s_clause 0x1
5723; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5724; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5725; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5726; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5727; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5728; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5729; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5730; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5731; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5732; GFX10-WGP-NEXT:    s_endpgm
5733;
5734; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
5735; GFX10-CU:       ; %bb.0: ; %entry
5736; GFX10-CU-NEXT:    s_clause 0x1
5737; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5738; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5739; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5740; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5741; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5742; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5743; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5744; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5745; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5746; GFX10-CU-NEXT:    s_endpgm
5747;
5748; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw:
5749; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5750; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5751; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5752; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5754; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5756; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5757; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5758; SKIP-CACHE-INV-NEXT:    s_endpgm
5759;
5760; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
5761; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5762; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5763; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5764; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5765; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5766; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5767; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5769; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5770; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5771;
5772; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
5773; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5774; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5775; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5776; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5777; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5778; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5779; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5780; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5781; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5782; GFX90A-TGSPLIT-NEXT:    s_endpgm
5783    i32* %out, i32 %in) {
5784entry:
5785  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release
5786  ret void
5787}
5788
; Test: acq_rel volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"); the exchanged-out value %val is never used.
; Expected code around the flat_atomic_swap, per check prefix (prefixes are
; bound to targets by the llc invocations at the top of the file):
;   * gfx700 HSA run: "s_waitcnt vmcnt(0)" before; "s_waitcnt vmcnt(0)" and
;     "buffer_wbinvl1_vol" after.
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)" and
;     "s_waitcnt_vscnt null, 0x0" before; "s_waitcnt_vscnt null, 0x0",
;     "buffer_gl0_inv" and "buffer_gl1_inv" after.
;   * gfx700 PAL run with -amdgcn-skip-cache-invalidations:
;     "s_waitcnt vmcnt(0)" before and after; no cache invalidate is expected.
;   * gfx90a, with and without tgsplit: "buffer_wbl2" and "s_waitcnt vmcnt(0)"
;     before; "s_waitcnt vmcnt(0)", "buffer_invl2" and "buffer_wbinvl1_vol"
;     after.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5789define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
5790; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5791; GFX7:       ; %bb.0: ; %entry
5792; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5793; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5794; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5795; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5796; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5797; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5798; GFX7-NEXT:    s_waitcnt vmcnt(0)
5799; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5800; GFX7-NEXT:    s_waitcnt vmcnt(0)
5801; GFX7-NEXT:    buffer_wbinvl1_vol
5802; GFX7-NEXT:    s_endpgm
5803;
5804; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5805; GFX10-WGP:       ; %bb.0: ; %entry
5806; GFX10-WGP-NEXT:    s_clause 0x1
5807; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5808; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5809; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5810; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5811; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5812; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5813; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5814; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5815; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5816; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5817; GFX10-WGP-NEXT:    buffer_gl0_inv
5818; GFX10-WGP-NEXT:    buffer_gl1_inv
5819; GFX10-WGP-NEXT:    s_endpgm
5820;
5821; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5822; GFX10-CU:       ; %bb.0: ; %entry
5823; GFX10-CU-NEXT:    s_clause 0x1
5824; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5825; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5826; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5827; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5828; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5829; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5830; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5831; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5832; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5833; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5834; GFX10-CU-NEXT:    buffer_gl0_inv
5835; GFX10-CU-NEXT:    buffer_gl1_inv
5836; GFX10-CU-NEXT:    s_endpgm
5837;
5838; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5839; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5840; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5841; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5842; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5843; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5845; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5846; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5847; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5848; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5849; SKIP-CACHE-INV-NEXT:    s_endpgm
5850;
5851; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5852; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5853; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5854; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5855; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5856; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5857; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5858; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5859; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5860; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5861; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5862; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5863; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5864; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5865;
5866; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
5867; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5868; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5869; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5870; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5871; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5872; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5873; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5874; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5875; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5876; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5877; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5878; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5879; GFX90A-TGSPLIT-NEXT:    s_endpgm
5880    i32* %out, i32 %in) {
5881entry:
5882  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
5883  ret void
5884}
5885
; Test: seq_cst volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"); the exchanged-out value %val is never used.
; Expected code around the flat_atomic_swap, per check prefix (prefixes are
; bound to targets by the llc invocations at the top of the file):
;   * gfx700 HSA run: "s_waitcnt vmcnt(0)" before; "s_waitcnt vmcnt(0)" and
;     "buffer_wbinvl1_vol" after.
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)" and
;     "s_waitcnt_vscnt null, 0x0" before; "s_waitcnt_vscnt null, 0x0",
;     "buffer_gl0_inv" and "buffer_gl1_inv" after.
;   * gfx700 PAL run with -amdgcn-skip-cache-invalidations:
;     "s_waitcnt vmcnt(0)" before and after; no cache invalidate is expected.
;   * gfx90a, with and without tgsplit: "buffer_wbl2" and "s_waitcnt vmcnt(0)"
;     before; "s_waitcnt vmcnt(0)", "buffer_invl2" and "buffer_wbinvl1_vol"
;     after.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5886define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
5887; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5888; GFX7:       ; %bb.0: ; %entry
5889; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5890; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5891; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5892; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5893; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5894; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5895; GFX7-NEXT:    s_waitcnt vmcnt(0)
5896; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
5897; GFX7-NEXT:    s_waitcnt vmcnt(0)
5898; GFX7-NEXT:    buffer_wbinvl1_vol
5899; GFX7-NEXT:    s_endpgm
5900;
5901; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5902; GFX10-WGP:       ; %bb.0: ; %entry
5903; GFX10-WGP-NEXT:    s_clause 0x1
5904; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5905; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
5906; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5907; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5908; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5909; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5910; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5911; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5912; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
5913; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5914; GFX10-WGP-NEXT:    buffer_gl0_inv
5915; GFX10-WGP-NEXT:    buffer_gl1_inv
5916; GFX10-WGP-NEXT:    s_endpgm
5917;
5918; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5919; GFX10-CU:       ; %bb.0: ; %entry
5920; GFX10-CU-NEXT:    s_clause 0x1
5921; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5922; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
5923; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5924; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5925; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5926; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5927; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5928; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5929; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
5930; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5931; GFX10-CU-NEXT:    buffer_gl0_inv
5932; GFX10-CU-NEXT:    buffer_gl1_inv
5933; GFX10-CU-NEXT:    s_endpgm
5934;
5935; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5936; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5937; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5938; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
5939; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5941; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5943; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5944; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
5945; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5946; SKIP-CACHE-INV-NEXT:    s_endpgm
5947;
5948; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5949; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5950; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5951; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5952; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5953; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5954; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5955; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5956; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5957; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5958; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5959; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5960; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5961; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5962;
5963; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
5964; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5965; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5966; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
5967; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5968; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
5969; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
5970; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5971; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5972; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
5973; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5974; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5975; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5976; GFX90A-TGSPLIT-NEXT:    s_endpgm
5977    i32* %out, i32 %in) {
5978entry:
5979  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
5980  ret void
5981}
5982
; Test: acquire volatile atomicrmw xchg on the flat pointer %out with
; syncscope("one-as"), where the exchanged-out value %val is used: it is
; written back to %out by a plain (non-atomic) store.
; Every prefix expects the returning form of the atomic,
; "flat_atomic_swap v2, v[0:1], v2 glc". Expected code between the atomic and
; the flat_store_dword, per check prefix (prefixes are bound to targets by the
; llc invocations at the top of the file):
;   * gfx700 HSA run: "s_waitcnt vmcnt(0)", "buffer_wbinvl1_vol",
;     "s_waitcnt lgkmcnt(0)".
;   * gfx1010, WGP and CU mode: "s_waitcnt vmcnt(0)", "buffer_gl0_inv",
;     "buffer_gl1_inv", "s_waitcnt lgkmcnt(0)".
;   * gfx700 PAL run with -amdgcn-skip-cache-invalidations: a single
;     "s_waitcnt vmcnt(0) lgkmcnt(0)"; no cache invalidate is expected.
;   * gfx90a without tgsplit: "s_waitcnt vmcnt(0)", "buffer_invl2",
;     "buffer_wbinvl1_vol", "s_waitcnt lgkmcnt(0)".
;   * gfx90a with tgsplit: "s_waitcnt vmcnt(0)", "buffer_invl2",
;     "buffer_wbinvl1_vol", and no lgkmcnt wait before the store.
; NOTE(review): the missing lgkmcnt wait in the tgsplit checks is presumably an
; intended codegen difference under +tgsplit -- confirm against fresh llc
; output when regenerating.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5983define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
5984; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
5985; GFX7:       ; %bb.0: ; %entry
5986; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5987; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
5988; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5989; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5990; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5991; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5992; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
5993; GFX7-NEXT:    s_waitcnt vmcnt(0)
5994; GFX7-NEXT:    buffer_wbinvl1_vol
5995; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5996; GFX7-NEXT:    flat_store_dword v[0:1], v2
5997; GFX7-NEXT:    s_endpgm
5998;
5999; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
6000; GFX10-WGP:       ; %bb.0: ; %entry
6001; GFX10-WGP-NEXT:    s_clause 0x1
6002; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6003; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
6004; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6005; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6006; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6007; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6008; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6009; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6010; GFX10-WGP-NEXT:    buffer_gl0_inv
6011; GFX10-WGP-NEXT:    buffer_gl1_inv
6012; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6013; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6014; GFX10-WGP-NEXT:    s_endpgm
6015;
6016; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
6017; GFX10-CU:       ; %bb.0: ; %entry
6018; GFX10-CU-NEXT:    s_clause 0x1
6019; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6020; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
6021; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6022; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6023; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6024; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6025; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6026; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6027; GFX10-CU-NEXT:    buffer_gl0_inv
6028; GFX10-CU-NEXT:    buffer_gl1_inv
6029; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6030; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6031; GFX10-CU-NEXT:    s_endpgm
6032;
6033; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
6034; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6035; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6036; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
6037; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6038; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6039; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6040; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6041; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6042; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6043; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6044; SKIP-CACHE-INV-NEXT:    s_endpgm
6045;
6046; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
6047; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6048; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6049; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6050; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6051; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6052; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6053; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6054; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6055; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6056; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6057; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6058; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6059; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6060;
6061; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
6062; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6063; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6064; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6065; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6066; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6067; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6068; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6069; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6070; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6071; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6072; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6073; GFX90A-TGSPLIT-NEXT:    s_endpgm
6074    i32* %out, i32 %in) {
6075entry:
6076  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
6077  store i32 %val, i32* %out, align 4
6078  ret void
6079}
6080
; Returning atomic exchange through a flat pointer with syncscope("one-as")
; and acq_rel ordering; the value returned by the exchange is stored back
; through %out.
;
; What the autogenerated assertions below expect around the glc swap:
;   - on gfx700, s_waitcnt vmcnt(0) before it, then s_waitcnt vmcnt(0) and
;     buffer_wbinvl1_vol after it;
;   - on both gfx1010 configurations, s_waitcnt vmcnt(0) plus
;     s_waitcnt_vscnt null, 0x0 before it, then s_waitcnt vmcnt(0),
;     buffer_gl0_inv and buffer_gl1_inv after it;
;   - on the amdpal run that passes -amdgcn-skip-cache-invalidations, the
;     waits remain but no cache invalidate follows the swap;
;   - on gfx90a, buffer_wbl2 and s_waitcnt vmcnt(0) before it, then
;     s_waitcnt vmcnt(0), buffer_invl2 and buffer_wbinvl1_vol after it; the
;     +tgsplit variant has no s_waitcnt lgkmcnt(0) before the final store.
6081define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
6082; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6083; GFX7:       ; %bb.0: ; %entry
6084; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6085; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
6086; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6087; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6088; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6089; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6090; GFX7-NEXT:    s_waitcnt vmcnt(0)
6091; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6092; GFX7-NEXT:    s_waitcnt vmcnt(0)
6093; GFX7-NEXT:    buffer_wbinvl1_vol
6094; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6095; GFX7-NEXT:    flat_store_dword v[0:1], v2
6096; GFX7-NEXT:    s_endpgm
6097;
6098; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6099; GFX10-WGP:       ; %bb.0: ; %entry
6100; GFX10-WGP-NEXT:    s_clause 0x1
6101; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6102; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
6103; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6104; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6105; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6106; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6107; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6108; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6109; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6110; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6111; GFX10-WGP-NEXT:    buffer_gl0_inv
6112; GFX10-WGP-NEXT:    buffer_gl1_inv
6113; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6114; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6115; GFX10-WGP-NEXT:    s_endpgm
6116;
6117; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6118; GFX10-CU:       ; %bb.0: ; %entry
6119; GFX10-CU-NEXT:    s_clause 0x1
6120; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6121; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
6122; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6123; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6124; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6125; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6126; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6127; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6128; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6129; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6130; GFX10-CU-NEXT:    buffer_gl0_inv
6131; GFX10-CU-NEXT:    buffer_gl1_inv
6132; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6133; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6134; GFX10-CU-NEXT:    s_endpgm
6135;
6136; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6137; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6138; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6139; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
6140; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6144; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6145; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6146; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6147; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6148; SKIP-CACHE-INV-NEXT:    s_endpgm
6149;
6150; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6151; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6152; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6153; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6154; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6155; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6156; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6157; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6158; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6159; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6160; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6161; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6162; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6163; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6164; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6165; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6166;
6167; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
6168; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6169; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6170; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6171; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6172; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6173; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6174; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6175; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6176; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6177; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6178; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6179; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6180; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6181; GFX90A-TGSPLIT-NEXT:    s_endpgm
6182    i32* %out, i32 %in) {
6183entry:
  ; The exchange result is consumed by the store below, which is why the
  ; expected swap carries glc and writes v2.
6184  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
6185  store i32 %val, i32* %out, align 4
6186  ret void
6187}
6188
; Returning atomic exchange through a flat pointer with syncscope("one-as")
; and seq_cst ordering; the value returned by the exchange is stored back
; through %out.
;
; What the autogenerated assertions below expect around the glc swap:
;   - on gfx700, s_waitcnt vmcnt(0) before it, then s_waitcnt vmcnt(0) and
;     buffer_wbinvl1_vol after it;
;   - on both gfx1010 configurations, s_waitcnt vmcnt(0) plus
;     s_waitcnt_vscnt null, 0x0 before it, then s_waitcnt vmcnt(0),
;     buffer_gl0_inv and buffer_gl1_inv after it;
;   - on the amdpal run that passes -amdgcn-skip-cache-invalidations, the
;     waits remain but no cache invalidate follows the swap;
;   - on gfx90a, buffer_wbl2 and s_waitcnt vmcnt(0) before it, then
;     s_waitcnt vmcnt(0), buffer_invl2 and buffer_wbinvl1_vol after it; the
;     +tgsplit variant has no s_waitcnt lgkmcnt(0) before the final store.
6189define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
6190; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6191; GFX7:       ; %bb.0: ; %entry
6192; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6193; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
6194; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6195; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6196; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6197; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6198; GFX7-NEXT:    s_waitcnt vmcnt(0)
6199; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6200; GFX7-NEXT:    s_waitcnt vmcnt(0)
6201; GFX7-NEXT:    buffer_wbinvl1_vol
6202; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6203; GFX7-NEXT:    flat_store_dword v[0:1], v2
6204; GFX7-NEXT:    s_endpgm
6205;
6206; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6207; GFX10-WGP:       ; %bb.0: ; %entry
6208; GFX10-WGP-NEXT:    s_clause 0x1
6209; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6210; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
6211; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6212; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6213; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6214; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6215; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6216; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6217; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6218; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6219; GFX10-WGP-NEXT:    buffer_gl0_inv
6220; GFX10-WGP-NEXT:    buffer_gl1_inv
6221; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6222; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6223; GFX10-WGP-NEXT:    s_endpgm
6224;
6225; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6226; GFX10-CU:       ; %bb.0: ; %entry
6227; GFX10-CU-NEXT:    s_clause 0x1
6228; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6229; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
6230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6233; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6234; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6235; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6236; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6237; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6238; GFX10-CU-NEXT:    buffer_gl0_inv
6239; GFX10-CU-NEXT:    buffer_gl1_inv
6240; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6241; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6242; GFX10-CU-NEXT:    s_endpgm
6243;
6244; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6245; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6246; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6247; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
6248; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6252; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6253; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6254; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6255; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6256; SKIP-CACHE-INV-NEXT:    s_endpgm
6257;
6258; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6259; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6260; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6262; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6263; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6264; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6265; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6266; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6267; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6268; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6269; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6270; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6271; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6272; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6273; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6274;
6275; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
6276; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6277; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6278; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
6279; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6280; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6281; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
6282; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6283; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6284; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
6285; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6286; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6287; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6288; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6289; GFX90A-TGSPLIT-NEXT:    s_endpgm
6290    i32* %out, i32 %in) {
6291entry:
  ; The exchange result is consumed by the store below, which is why the
  ; expected swap carries glc and writes v2.
6292  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
6293  store i32 %val, i32* %out, align 4
6294  ret void
6295}
6296
; Compare-and-swap through a flat pointer with syncscope("one-as") and
; monotonic success / monotonic failure ordering. The result is unused.
;
; What the autogenerated assertions below expect: on every configuration the
; flat_atomic_cmpswap (emitted without glc) is followed directly by
; s_endpgm, with no wait, write-back or cache-invalidate instruction placed
; around it for ordering.
6297define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
6298; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6299; GFX7:       ; %bb.0: ; %entry
6300; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6301; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6302; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6303; GFX7-NEXT:    s_add_u32 s0, s0, 16
6304; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6305; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6306; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6307; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6308; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6309; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6310; GFX7-NEXT:    s_endpgm
6311;
6312; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6313; GFX10-WGP:       ; %bb.0: ; %entry
6314; GFX10-WGP-NEXT:    s_clause 0x1
6315; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6316; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6317; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6318; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6319; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6320; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6321; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6322; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6323; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6324; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6325; GFX10-WGP-NEXT:    s_endpgm
6326;
6327; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6328; GFX10-CU:       ; %bb.0: ; %entry
6329; GFX10-CU-NEXT:    s_clause 0x1
6330; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6331; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6332; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6333; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6334; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6335; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6336; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6337; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6338; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6339; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6340; GFX10-CU-NEXT:    s_endpgm
6341;
6342; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6343; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6344; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6345; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6346; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6347; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6348; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6351; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6352; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6353; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6354; SKIP-CACHE-INV-NEXT:    s_endpgm
6355;
6356; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6357; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6358; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6359; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6360; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6361; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6362; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6363; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6364; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6365;
6366; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
6367; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6368; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6369; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6370; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6371; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6372; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6373; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6374; GFX90A-TGSPLIT-NEXT:    s_endpgm
6375    i32* %out, i32 %in, i32 %old) {
6376entry:
  ; Index 4 of an i32 pointer is a 16-byte displacement; the expected code
  ; shows it as s_add_u32 ..., 16 on gfx700/gfx1010 and as offset:16 on gfx90a.
6377  %gep = getelementptr i32, i32* %out, i32 4
6378  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
6379  ret void
6380}
6381
; Compare-and-swap through a flat pointer with syncscope("one-as") and
; acquire success / monotonic failure ordering. The result is unused.
;
; What the autogenerated assertions below expect after the
; flat_atomic_cmpswap (emitted without glc), with nothing extra before it:
;   - on gfx700, s_waitcnt vmcnt(0) then buffer_wbinvl1_vol;
;   - on both gfx1010 configurations, s_waitcnt_vscnt null, 0x0 then
;     buffer_gl0_inv and buffer_gl1_inv;
;   - on the amdpal run that passes -amdgcn-skip-cache-invalidations, only
;     s_waitcnt vmcnt(0), with no cache invalidate;
;   - on gfx90a (with and without +tgsplit), s_waitcnt vmcnt(0) then
;     buffer_invl2 and buffer_wbinvl1_vol.
6382define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
6383; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6384; GFX7:       ; %bb.0: ; %entry
6385; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6386; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6387; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6388; GFX7-NEXT:    s_add_u32 s0, s0, 16
6389; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6390; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6391; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6392; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6393; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6394; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6395; GFX7-NEXT:    s_waitcnt vmcnt(0)
6396; GFX7-NEXT:    buffer_wbinvl1_vol
6397; GFX7-NEXT:    s_endpgm
6398;
6399; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6400; GFX10-WGP:       ; %bb.0: ; %entry
6401; GFX10-WGP-NEXT:    s_clause 0x1
6402; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6403; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6404; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6405; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6406; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6407; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6408; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6409; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6410; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6411; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6412; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6413; GFX10-WGP-NEXT:    buffer_gl0_inv
6414; GFX10-WGP-NEXT:    buffer_gl1_inv
6415; GFX10-WGP-NEXT:    s_endpgm
6416;
6417; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6418; GFX10-CU:       ; %bb.0: ; %entry
6419; GFX10-CU-NEXT:    s_clause 0x1
6420; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6421; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6422; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6423; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6424; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6425; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6426; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6427; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6428; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6429; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6430; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6431; GFX10-CU-NEXT:    buffer_gl0_inv
6432; GFX10-CU-NEXT:    buffer_gl1_inv
6433; GFX10-CU-NEXT:    s_endpgm
6434;
6435; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6436; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6437; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6438; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6439; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6440; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6441; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6443; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6444; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6445; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6446; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6447; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6448; SKIP-CACHE-INV-NEXT:    s_endpgm
6449;
6450; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6451; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6452; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6453; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6454; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6455; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6456; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6457; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6458; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6459; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6460; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6461; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6462;
6463; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
6464; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6465; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6466; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6467; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6468; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6469; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6470; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6471; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6472; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6473; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6474; GFX90A-TGSPLIT-NEXT:    s_endpgm
6475    i32* %out, i32 %in, i32 %old) {
6476entry:
  ; Index 4 of an i32 pointer is a 16-byte displacement; the expected code
  ; shows it as s_add_u32 ..., 16 on gfx700/gfx1010 and as offset:16 on gfx90a.
6477  %gep = getelementptr i32, i32* %out, i32 4
6478  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
6479  ret void
6480}
6481
; Compare-and-swap through a flat pointer with syncscope("one-as") and
; release success / monotonic failure ordering. The result is unused.
;
; What the autogenerated assertions below expect before the
; flat_atomic_cmpswap (emitted without glc), with nothing extra after it:
;   - on gfx700 and on the amdpal run that passes
;     -amdgcn-skip-cache-invalidations, s_waitcnt vmcnt(0);
;   - on both gfx1010 configurations, s_waitcnt vmcnt(0) plus
;     s_waitcnt_vscnt null, 0x0;
;   - on gfx90a (with and without +tgsplit), buffer_wbl2 then
;     s_waitcnt vmcnt(0).
6482define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
6483; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6484; GFX7:       ; %bb.0: ; %entry
6485; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6486; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6487; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6488; GFX7-NEXT:    s_add_u32 s0, s0, 16
6489; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6490; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6491; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6492; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6493; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6494; GFX7-NEXT:    s_waitcnt vmcnt(0)
6495; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6496; GFX7-NEXT:    s_endpgm
6497;
6498; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6499; GFX10-WGP:       ; %bb.0: ; %entry
6500; GFX10-WGP-NEXT:    s_clause 0x1
6501; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6502; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6503; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6504; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6505; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6506; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6507; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6508; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6509; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6510; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6511; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6512; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6513; GFX10-WGP-NEXT:    s_endpgm
6514;
6515; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6516; GFX10-CU:       ; %bb.0: ; %entry
6517; GFX10-CU-NEXT:    s_clause 0x1
6518; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6519; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6520; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6521; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6522; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6523; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6524; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6525; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6526; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6527; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6528; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6529; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6530; GFX10-CU-NEXT:    s_endpgm
6531;
6532; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6533; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6534; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6535; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6536; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6537; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6538; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6541; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6543; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6544; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6545; SKIP-CACHE-INV-NEXT:    s_endpgm
6546;
6547; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6548; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6549; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6550; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6551; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6552; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6553; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6554; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6555; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6556; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6557; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6558;
6559; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
6560; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6561; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6562; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6563; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6564; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6565; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6566; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6567; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6568; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6569; GFX90A-TGSPLIT-NEXT:    s_endpgm
6570    i32* %out, i32 %in, i32 %old) {
6571entry:
  ; Index 4 of an i32 pointer is a 16-byte displacement; the expected code
  ; shows it as s_add_u32 ..., 16 on gfx700/gfx1010 and as offset:16 on gfx90a.
6572  %gep = getelementptr i32, i32* %out, i32 4
6573  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
6574  ret void
6575}
6576
; Compare-and-swap through a flat pointer with syncscope("one-as") and
; acq_rel success / monotonic failure ordering. The result is unused.
;
; What the autogenerated assertions below expect around the
; flat_atomic_cmpswap (emitted without glc):
;   - on gfx700, s_waitcnt vmcnt(0) before it, then s_waitcnt vmcnt(0) and
;     buffer_wbinvl1_vol after it;
;   - on both gfx1010 configurations, s_waitcnt vmcnt(0) plus
;     s_waitcnt_vscnt null, 0x0 before it, then s_waitcnt_vscnt null, 0x0,
;     buffer_gl0_inv and buffer_gl1_inv after it;
;   - on the amdpal run that passes -amdgcn-skip-cache-invalidations,
;     s_waitcnt vmcnt(0) on both sides and no cache invalidate;
;   - on gfx90a (with and without +tgsplit), buffer_wbl2 and
;     s_waitcnt vmcnt(0) before it, then s_waitcnt vmcnt(0), buffer_invl2 and
;     buffer_wbinvl1_vol after it.
6577define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
6578; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6579; GFX7:       ; %bb.0: ; %entry
6580; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6581; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6582; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6583; GFX7-NEXT:    s_add_u32 s0, s0, 16
6584; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6585; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6586; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6587; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6588; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6589; GFX7-NEXT:    s_waitcnt vmcnt(0)
6590; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6591; GFX7-NEXT:    s_waitcnt vmcnt(0)
6592; GFX7-NEXT:    buffer_wbinvl1_vol
6593; GFX7-NEXT:    s_endpgm
6594;
6595; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6596; GFX10-WGP:       ; %bb.0: ; %entry
6597; GFX10-WGP-NEXT:    s_clause 0x1
6598; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6599; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6600; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6601; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6602; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6603; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6604; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6605; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6606; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6607; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6608; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6609; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6610; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6611; GFX10-WGP-NEXT:    buffer_gl0_inv
6612; GFX10-WGP-NEXT:    buffer_gl1_inv
6613; GFX10-WGP-NEXT:    s_endpgm
6614;
6615; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6616; GFX10-CU:       ; %bb.0: ; %entry
6617; GFX10-CU-NEXT:    s_clause 0x1
6618; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6619; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6620; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6621; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6622; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6623; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6624; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6625; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6626; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6627; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6628; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6629; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6630; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6631; GFX10-CU-NEXT:    buffer_gl0_inv
6632; GFX10-CU-NEXT:    buffer_gl1_inv
6633; GFX10-CU-NEXT:    s_endpgm
6634;
6635; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6636; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6637; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6638; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6639; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6640; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6641; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6642; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6646; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6647; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6648; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6649; SKIP-CACHE-INV-NEXT:    s_endpgm
6650;
6651; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6652; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6653; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6655; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6656; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6657; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6658; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6659; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6660; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6661; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6662; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6663; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6664; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6665;
6666; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
6667; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6668; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6669; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6670; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6671; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6672; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6673; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6674; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6675; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6676; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6677; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6678; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6679; GFX90A-TGSPLIT-NEXT:    s_endpgm
6680    i32* %out, i32 %in, i32 %old) {
6681entry:
  ; Index 4 of an i32 pointer is a 16-byte displacement; the expected code
  ; shows it as s_add_u32 ..., 16 on gfx700/gfx1010 and as offset:16 on gfx90a.
6682  %gep = getelementptr i32, i32* %out, i32 4
6683  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
6684  ret void
6685}
6686
6687define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
6688; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6689; GFX7:       ; %bb.0: ; %entry
6690; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6691; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6692; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6693; GFX7-NEXT:    s_add_u32 s0, s0, 16
6694; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6695; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6696; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6697; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6698; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6699; GFX7-NEXT:    s_waitcnt vmcnt(0)
6700; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6701; GFX7-NEXT:    s_waitcnt vmcnt(0)
6702; GFX7-NEXT:    buffer_wbinvl1_vol
6703; GFX7-NEXT:    s_endpgm
6704;
6705; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6706; GFX10-WGP:       ; %bb.0: ; %entry
6707; GFX10-WGP-NEXT:    s_clause 0x1
6708; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6709; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6710; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6711; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6712; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6713; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6714; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6715; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6716; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6717; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6718; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6719; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6720; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6721; GFX10-WGP-NEXT:    buffer_gl0_inv
6722; GFX10-WGP-NEXT:    buffer_gl1_inv
6723; GFX10-WGP-NEXT:    s_endpgm
6724;
6725; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6726; GFX10-CU:       ; %bb.0: ; %entry
6727; GFX10-CU-NEXT:    s_clause 0x1
6728; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6729; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6730; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6731; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6732; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6733; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6734; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6735; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6736; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6737; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6738; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6739; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6740; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6741; GFX10-CU-NEXT:    buffer_gl0_inv
6742; GFX10-CU-NEXT:    buffer_gl1_inv
6743; GFX10-CU-NEXT:    s_endpgm
6744;
6745; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6746; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6747; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6748; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6749; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6750; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6751; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6754; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6756; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6757; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6758; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6759; SKIP-CACHE-INV-NEXT:    s_endpgm
6760;
6761; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6762; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6763; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6764; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6765; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6766; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6767; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6768; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6769; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6770; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6771; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6772; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6773; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6774; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6775;
6776; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
6777; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6778; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6779; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6780; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6781; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6782; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6783; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6784; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6785; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6786; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6787; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6788; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6789; GFX90A-TGSPLIT-NEXT:    s_endpgm
6790    i32* %out, i32 %in, i32 %old) {
6791entry:
6792  %gep = getelementptr i32, i32* %out, i32 4
6793  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
6794  ret void
6795}
6796
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering monotonic and failure ordering acquire.
; The expectations below show no wait before the atomic, and a wait after it;
; every run except the one using -amdgcn-skip-cache-invalidations also expects
; cache invalidation instructions after that wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6797define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
6798; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6799; GFX7:       ; %bb.0: ; %entry
6800; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6801; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6802; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6803; GFX7-NEXT:    s_add_u32 s0, s0, 16
6804; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6805; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6806; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6807; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6808; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6809; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6810; GFX7-NEXT:    s_waitcnt vmcnt(0)
6811; GFX7-NEXT:    buffer_wbinvl1_vol
6812; GFX7-NEXT:    s_endpgm
6813;
6814; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6815; GFX10-WGP:       ; %bb.0: ; %entry
6816; GFX10-WGP-NEXT:    s_clause 0x1
6817; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6818; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6819; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6820; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6821; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6822; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6823; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6824; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6825; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6826; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6827; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6828; GFX10-WGP-NEXT:    buffer_gl0_inv
6829; GFX10-WGP-NEXT:    buffer_gl1_inv
6830; GFX10-WGP-NEXT:    s_endpgm
6831;
6832; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6833; GFX10-CU:       ; %bb.0: ; %entry
6834; GFX10-CU-NEXT:    s_clause 0x1
6835; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6836; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6837; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6838; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6839; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6840; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6841; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6842; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6843; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6844; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6845; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6846; GFX10-CU-NEXT:    buffer_gl0_inv
6847; GFX10-CU-NEXT:    buffer_gl1_inv
6848; GFX10-CU-NEXT:    s_endpgm
6849;
6850; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6851; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6852; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6853; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6854; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6855; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6856; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6857; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6858; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6859; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6861; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6862; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6863; SKIP-CACHE-INV-NEXT:    s_endpgm
6864;
6865; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6866; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6867; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6868; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6870; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6871; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6872; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6873; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6874; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6875; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6876; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6877;
6878; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
6879; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6880; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6881; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6882; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6883; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6884; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6885; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6886; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6887; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6888; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6889; GFX90A-TGSPLIT-NEXT:    s_endpgm
6890    i32* %out, i32 %in, i32 %old) {
6891entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
6892  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
6893  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
6894  ret void
6895}
6896
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering acquire and failure ordering acquire.
; The expectations below show no wait before the atomic, and a wait after it;
; every run except the one using -amdgcn-skip-cache-invalidations also expects
; cache invalidation instructions after that wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6897define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
6898; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6899; GFX7:       ; %bb.0: ; %entry
6900; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6901; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
6902; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6903; GFX7-NEXT:    s_add_u32 s0, s0, 16
6904; GFX7-NEXT:    s_addc_u32 s1, s1, 0
6905; GFX7-NEXT:    v_mov_b32_e32 v0, s0
6906; GFX7-NEXT:    v_mov_b32_e32 v2, s2
6907; GFX7-NEXT:    v_mov_b32_e32 v1, s1
6908; GFX7-NEXT:    v_mov_b32_e32 v3, s3
6909; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6910; GFX7-NEXT:    s_waitcnt vmcnt(0)
6911; GFX7-NEXT:    buffer_wbinvl1_vol
6912; GFX7-NEXT:    s_endpgm
6913;
6914; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6915; GFX10-WGP:       ; %bb.0: ; %entry
6916; GFX10-WGP-NEXT:    s_clause 0x1
6917; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6918; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6919; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6920; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
6921; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
6922; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
6923; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
6924; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
6925; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
6926; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6927; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6928; GFX10-WGP-NEXT:    buffer_gl0_inv
6929; GFX10-WGP-NEXT:    buffer_gl1_inv
6930; GFX10-WGP-NEXT:    s_endpgm
6931;
6932; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6933; GFX10-CU:       ; %bb.0: ; %entry
6934; GFX10-CU-NEXT:    s_clause 0x1
6935; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6936; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6937; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6938; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
6939; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
6940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
6941; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
6942; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
6943; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
6944; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6945; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6946; GFX10-CU-NEXT:    buffer_gl0_inv
6947; GFX10-CU-NEXT:    buffer_gl1_inv
6948; GFX10-CU-NEXT:    s_endpgm
6949;
6950; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6951; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6952; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
6953; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6954; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6955; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
6956; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
6957; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6958; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
6959; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6960; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
6961; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6962; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6963; SKIP-CACHE-INV-NEXT:    s_endpgm
6964;
6965; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6966; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6967; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6968; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6969; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6970; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6971; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6972; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6973; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6974; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6975; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6976; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6977;
6978; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
6979; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6980; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6981; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
6982; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6983; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
6984; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
6985; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6986; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6987; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6988; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6989; GFX90A-TGSPLIT-NEXT:    s_endpgm
6990    i32* %out, i32 %in, i32 %old) {
6991entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
6992  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
6993  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
6994  ret void
6995}
6996
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering release and failure ordering acquire.
; The expectations below show a wait both before and after the atomic. The
; gfx90a runs additionally expect buffer_wbl2 ahead of the leading wait, and
; every run except the one using -amdgcn-skip-cache-invalidations expects
; cache invalidation instructions after the trailing wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6997define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
6998; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
6999; GFX7:       ; %bb.0: ; %entry
7000; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7001; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7003; GFX7-NEXT:    s_add_u32 s0, s0, 16
7004; GFX7-NEXT:    s_addc_u32 s1, s1, 0
7005; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7006; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7007; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7008; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7009; GFX7-NEXT:    s_waitcnt vmcnt(0)
7010; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7011; GFX7-NEXT:    s_waitcnt vmcnt(0)
7012; GFX7-NEXT:    buffer_wbinvl1_vol
7013; GFX7-NEXT:    s_endpgm
7014;
7015; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
7016; GFX10-WGP:       ; %bb.0: ; %entry
7017; GFX10-WGP-NEXT:    s_clause 0x1
7018; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7019; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7020; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7021; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
7022; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
7023; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7024; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7025; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7026; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7027; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7028; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7029; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7030; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7031; GFX10-WGP-NEXT:    buffer_gl0_inv
7032; GFX10-WGP-NEXT:    buffer_gl1_inv
7033; GFX10-WGP-NEXT:    s_endpgm
7034;
7035; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
7036; GFX10-CU:       ; %bb.0: ; %entry
7037; GFX10-CU-NEXT:    s_clause 0x1
7038; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7039; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7040; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7041; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
7042; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
7043; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7044; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7045; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7046; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7047; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7048; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7049; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7050; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7051; GFX10-CU-NEXT:    buffer_gl0_inv
7052; GFX10-CU-NEXT:    buffer_gl1_inv
7053; GFX10-CU-NEXT:    s_endpgm
7054;
7055; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg:
7056; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7057; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7058; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7059; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7060; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
7061; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
7062; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7063; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7064; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7065; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7066; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7067; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7068; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7069; SKIP-CACHE-INV-NEXT:    s_endpgm
7070;
7071; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
7072; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7073; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7074; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7075; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7076; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7077; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7078; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7079; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7080; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7081; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7082; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7083; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7084; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7085;
7086; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
7087; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7088; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7089; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7090; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7091; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7092; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7093; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7094; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7095; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7096; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7097; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7098; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7099; GFX90A-TGSPLIT-NEXT:    s_endpgm
7100    i32* %out, i32 %in, i32 %old) {
7101entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
7102  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
7103  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
7104  ret void
7105}
7106
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering acq_rel and failure ordering acquire.
; The expectations below show a wait both before and after the atomic. The
; gfx90a runs additionally expect buffer_wbl2 ahead of the leading wait, and
; every run except the one using -amdgcn-skip-cache-invalidations expects
; cache invalidation instructions after the trailing wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
7107define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
7108; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7109; GFX7:       ; %bb.0: ; %entry
7110; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7111; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7112; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7113; GFX7-NEXT:    s_add_u32 s0, s0, 16
7114; GFX7-NEXT:    s_addc_u32 s1, s1, 0
7115; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7116; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7117; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7118; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7119; GFX7-NEXT:    s_waitcnt vmcnt(0)
7120; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7121; GFX7-NEXT:    s_waitcnt vmcnt(0)
7122; GFX7-NEXT:    buffer_wbinvl1_vol
7123; GFX7-NEXT:    s_endpgm
7124;
7125; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7126; GFX10-WGP:       ; %bb.0: ; %entry
7127; GFX10-WGP-NEXT:    s_clause 0x1
7128; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7129; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7130; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7131; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
7132; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
7133; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7134; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7135; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7136; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7137; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7138; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7139; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7140; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7141; GFX10-WGP-NEXT:    buffer_gl0_inv
7142; GFX10-WGP-NEXT:    buffer_gl1_inv
7143; GFX10-WGP-NEXT:    s_endpgm
7144;
7145; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7146; GFX10-CU:       ; %bb.0: ; %entry
7147; GFX10-CU-NEXT:    s_clause 0x1
7148; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7149; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7150; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7151; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
7152; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
7153; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7154; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7155; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7156; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7157; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7158; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7159; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7160; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7161; GFX10-CU-NEXT:    buffer_gl0_inv
7162; GFX10-CU-NEXT:    buffer_gl1_inv
7163; GFX10-CU-NEXT:    s_endpgm
7164;
7165; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7166; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7167; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7168; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7169; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7170; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
7171; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
7172; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7173; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7174; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7175; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7176; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7177; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7178; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7179; SKIP-CACHE-INV-NEXT:    s_endpgm
7180;
7181; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7182; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7183; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7184; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7185; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7186; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7187; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7188; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7189; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7190; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7191; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7192; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7193; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7194; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7195;
7196; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
7197; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7198; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7199; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7200; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7201; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7202; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7203; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7204; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7205; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7206; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7207; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7208; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7209; GFX90A-TGSPLIT-NEXT:    s_endpgm
7210    i32* %out, i32 %in, i32 %old) {
7211entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
7212  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
7213  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
7214  ret void
7215}
7216
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering seq_cst and failure ordering acquire.
; The expectations below show a wait both before and after the atomic. The
; gfx90a runs additionally expect buffer_wbl2 ahead of the leading wait, and
; every run except the one using -amdgcn-skip-cache-invalidations expects
; cache invalidation instructions after the trailing wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
7217define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
7218; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7219; GFX7:       ; %bb.0: ; %entry
7220; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7221; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7222; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7223; GFX7-NEXT:    s_add_u32 s0, s0, 16
7224; GFX7-NEXT:    s_addc_u32 s1, s1, 0
7225; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7226; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7227; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7228; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7229; GFX7-NEXT:    s_waitcnt vmcnt(0)
7230; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7231; GFX7-NEXT:    s_waitcnt vmcnt(0)
7232; GFX7-NEXT:    buffer_wbinvl1_vol
7233; GFX7-NEXT:    s_endpgm
7234;
7235; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7236; GFX10-WGP:       ; %bb.0: ; %entry
7237; GFX10-WGP-NEXT:    s_clause 0x1
7238; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7239; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7240; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7241; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
7242; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
7243; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7244; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7245; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7246; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7247; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7248; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7249; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7250; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7251; GFX10-WGP-NEXT:    buffer_gl0_inv
7252; GFX10-WGP-NEXT:    buffer_gl1_inv
7253; GFX10-WGP-NEXT:    s_endpgm
7254;
7255; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7256; GFX10-CU:       ; %bb.0: ; %entry
7257; GFX10-CU-NEXT:    s_clause 0x1
7258; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7259; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7260; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7261; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
7262; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
7263; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7264; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7265; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7266; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7267; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7268; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7269; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7270; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7271; GFX10-CU-NEXT:    buffer_gl0_inv
7272; GFX10-CU-NEXT:    buffer_gl1_inv
7273; GFX10-CU-NEXT:    s_endpgm
7274;
7275; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7276; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7277; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7278; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7279; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7280; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
7281; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
7282; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7283; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7285; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7286; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7287; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7288; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7289; SKIP-CACHE-INV-NEXT:    s_endpgm
7290;
7291; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7292; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7293; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7294; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7295; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7296; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7297; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7298; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7299; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7300; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7301; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7302; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7303; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7304; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7305;
7306; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
7307; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7308; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7309; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7310; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7311; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7312; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7313; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7314; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7315; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7316; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7317; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7318; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7319; GFX90A-TGSPLIT-NEXT:    s_endpgm
7320    i32* %out, i32 %in, i32 %old) {
7321entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
7322  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
7323  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
7324  ret void
7325}
7326
; Code-generation test for a volatile cmpxchg through a flat i32 pointer with
; syncscope("one-as"), success ordering monotonic and failure ordering seq_cst.
; Although the success ordering is only monotonic, the expectations below show
; a wait both before and after the atomic (presumably driven by the seq_cst
; failure ordering - confirm against the AMDGPU memory model). The gfx90a runs
; additionally expect buffer_wbl2 ahead of the leading wait, and every run
; except the one using -amdgcn-skip-cache-invalidations expects cache
; invalidation instructions after the trailing wait.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
7327define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
7328; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7329; GFX7:       ; %bb.0: ; %entry
7330; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7331; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
7332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7333; GFX7-NEXT:    s_add_u32 s0, s0, 16
7334; GFX7-NEXT:    s_addc_u32 s1, s1, 0
7335; GFX7-NEXT:    v_mov_b32_e32 v0, s0
7336; GFX7-NEXT:    v_mov_b32_e32 v2, s2
7337; GFX7-NEXT:    v_mov_b32_e32 v1, s1
7338; GFX7-NEXT:    v_mov_b32_e32 v3, s3
7339; GFX7-NEXT:    s_waitcnt vmcnt(0)
7340; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7341; GFX7-NEXT:    s_waitcnt vmcnt(0)
7342; GFX7-NEXT:    buffer_wbinvl1_vol
7343; GFX7-NEXT:    s_endpgm
7344;
7345; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7346; GFX10-WGP:       ; %bb.0: ; %entry
7347; GFX10-WGP-NEXT:    s_clause 0x1
7348; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7349; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7350; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7351; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
7352; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
7353; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
7354; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
7355; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
7356; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
7357; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7358; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7359; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7360; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7361; GFX10-WGP-NEXT:    buffer_gl0_inv
7362; GFX10-WGP-NEXT:    buffer_gl1_inv
7363; GFX10-WGP-NEXT:    s_endpgm
7364;
7365; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7366; GFX10-CU:       ; %bb.0: ; %entry
7367; GFX10-CU-NEXT:    s_clause 0x1
7368; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7369; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7370; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7371; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
7372; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
7373; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
7374; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
7375; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
7376; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
7377; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7378; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7379; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7380; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7381; GFX10-CU-NEXT:    buffer_gl0_inv
7382; GFX10-CU-NEXT:    buffer_gl1_inv
7383; GFX10-CU-NEXT:    s_endpgm
7384;
7385; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7386; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7387; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
7388; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7389; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7390; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
7391; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
7392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7393; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
7394; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7395; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
7396; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7397; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
7398; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7399; SKIP-CACHE-INV-NEXT:    s_endpgm
7400;
7401; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7402; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7403; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7404; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7405; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7406; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7407; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7408; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7409; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7410; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7411; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7412; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7413; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7414; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7415;
7416; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
7417; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7418; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7419; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
7420; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7421; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
7422; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
7423; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7424; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7425; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
7426; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7427; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7428; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7429; GFX90A-TGSPLIT-NEXT:    s_endpgm
7430    i32* %out, i32 %in, i32 %old) {
7431entry:
  ; Element 4 of an i32 array is a 16-byte displacement; the expectations show
  ; it either as an add of 16 to the base address or as offset:16.
7432  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in the replacement. The result %val is never
  ; used, and the expected flat_atomic_cmpswap has only address and data operands.
7433  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
7434  ret void
7435}
7436
; Kernel under test: volatile cmpxchg on an i32* at syncscope("one-as") with
; success ordering acquire and failure ordering seq_cst. The address is %out
; advanced by four i32 elements (16 bytes), %old is the compare value, %in is
; the replacement, and the cmpxchg result is unused.
;
; In the expectations below every run brackets flat_atomic_cmpswap with
; s_waitcnt instructions, and every run except the one using
; -amdgcn-skip-cache-invalidations follows it with cache invalidates.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
  ret void
}
7546
; Kernel under test: volatile cmpxchg on an i32* at syncscope("one-as") with
; success ordering release and failure ordering seq_cst. The address is %out
; advanced by four i32 elements (16 bytes), %old is the compare value, %in is
; the replacement, and the cmpxchg result is unused.
;
; In the expectations below every run brackets flat_atomic_cmpswap with
; s_waitcnt instructions, and every run except the one using
; -amdgcn-skip-cache-invalidations follows it with cache invalidates.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
  ret void
}
7656
; Kernel under test: volatile cmpxchg on an i32* at syncscope("one-as") with
; success ordering acq_rel and failure ordering seq_cst. The address is %out
; advanced by four i32 elements (16 bytes), %old is the compare value, %in is
; the replacement, and the cmpxchg result is unused.
;
; In the expectations below every run brackets flat_atomic_cmpswap with
; s_waitcnt instructions, and every run except the one using
; -amdgcn-skip-cache-invalidations follows it with cache invalidates.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
  ret void
}
7766
; Kernel under test: volatile cmpxchg on an i32* at syncscope("one-as") with
; seq_cst as both the success and the failure ordering. The address is %out
; advanced by four i32 elements (16 bytes), %old is the compare value, %in is
; the replacement, and the cmpxchg result is unused.
;
; In the expectations below every run brackets flat_atomic_cmpswap with
; s_waitcnt instructions, and every run except the one using
; -amdgcn-skip-cache-invalidations follows it with cache invalidates.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 16
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
  ret void
}
7876
; Kernel under test: value-returning volatile cmpxchg on an i32* at
; syncscope("one-as") with monotonic as both the success and the failure
; ordering. The address is %out advanced by four i32 elements (16 bytes),
; %old is the compare value and %in is the replacement. Field 0 of the
; cmpxchg result (the loaded value) is stored back to %out.
;
; In the expectations below the atomic carries the glc bit, no s_waitcnt is
; placed directly before it, no cache invalidate follows it, and a single
; s_waitcnt precedes the dependent flat_store_dword.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
7983
; Kernel under test: value-returning volatile cmpxchg on an i32* at
; syncscope("one-as") with success ordering acquire and failure ordering
; monotonic. The address is %out advanced by four i32 elements (16 bytes),
; %old is the compare value and %in is the replacement. Field 0 of the
; cmpxchg result (the loaded value) is stored back to %out.
;
; In the expectations below the atomic carries the glc bit and is followed by
; "s_waitcnt vmcnt(0)"; every run except the one using
; -amdgcn-skip-cache-invalidations then issues cache invalidates before the
; dependent flat_store_dword.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
8104
; Test kernel: volatile cmpxchg with syncscope("one-as"), success ordering
; `release` and failure ordering `monotonic`, whose loaded value is used.
; It compare-and-swaps the i32 at %out[4] (the 16-byte offset seen in the
; expected output; compared against %old, replaced by %in) and stores the
; value that was loaded (field 0 of the result) to %out[0].
; In every expected sequence below an `s_waitcnt vmcnt(0)` precedes the atomic
; and no cache-invalidate instruction follows it.
; The assertion lines inside the signature are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8105define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
8106; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8107; GFX7:       ; %bb.0: ; %entry
8108; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8109; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8110; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8111; GFX7-NEXT:    s_add_u32 s4, s0, 16
8112; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8113; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8114; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8115; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8116; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8117; GFX7-NEXT:    s_waitcnt vmcnt(0)
8118; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8119; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8120; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8121; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8122; GFX7-NEXT:    flat_store_dword v[0:1], v2
8123; GFX7-NEXT:    s_endpgm
8124;
8125; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8126; GFX10-WGP:       ; %bb.0: ; %entry
8127; GFX10-WGP-NEXT:    s_clause 0x1
8128; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8129; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8130; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8131; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8132; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8133; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8134; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8135; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8136; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8137; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8138; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8139; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8140; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8141; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8142; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8143; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8144; GFX10-WGP-NEXT:    s_endpgm
8145;
8146; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8147; GFX10-CU:       ; %bb.0: ; %entry
8148; GFX10-CU-NEXT:    s_clause 0x1
8149; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8150; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8151; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8152; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8153; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8154; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8155; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8156; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8157; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8158; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8159; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8160; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8161; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8163; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8164; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8165; GFX10-CU-NEXT:    s_endpgm
8166;
8167; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8168; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8169; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8170; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8171; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8172; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8173; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8174; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8175; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8176; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8177; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8178; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8179; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8180; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8182; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8183; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8184; SKIP-CACHE-INV-NEXT:    s_endpgm
8185;
8186; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8187; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8188; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8189; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8190; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8191; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8192; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8193; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
8194; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8195; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8196; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8197; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8198; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8199;
8200; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
8201; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8202; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8203; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8204; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8205; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8206; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8207; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
8208; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8209; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8210; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8211; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8212; GFX90A-TGSPLIT-NEXT:    s_endpgm
8213    i32* %out, i32 %in, i32 %old) {
8214entry:
8215  %gep = getelementptr i32, i32* %out, i32 4
8216  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
8217  %val0 = extractvalue { i32, i1 } %val, 0
8218  store i32 %val0, i32* %out, align 4
8219  ret void
8220}
8221
; Test kernel: volatile cmpxchg with syncscope("one-as"), success ordering
; `acq_rel` and failure ordering `monotonic`, whose loaded value is used.
; It compare-and-swaps the i32 at %out[4] (the 16-byte offset seen in the
; expected output; compared against %old, replaced by %in) and stores the
; value that was loaded (field 0 of the result) to %out[0].
; In every expected sequence below an `s_waitcnt vmcnt(0)` both precedes and
; follows the atomic; the trailing wait is followed by cache-invalidate
; instructions except in the run that passes -amdgcn-skip-cache-invalidations.
; The assertion lines inside the signature are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8222define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
8223; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8224; GFX7:       ; %bb.0: ; %entry
8225; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8226; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8227; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8228; GFX7-NEXT:    s_add_u32 s4, s0, 16
8229; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8230; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8231; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8232; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8233; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8234; GFX7-NEXT:    s_waitcnt vmcnt(0)
8235; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8236; GFX7-NEXT:    s_waitcnt vmcnt(0)
8237; GFX7-NEXT:    buffer_wbinvl1_vol
8238; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8239; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8240; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8241; GFX7-NEXT:    flat_store_dword v[0:1], v2
8242; GFX7-NEXT:    s_endpgm
8243;
8244; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8245; GFX10-WGP:       ; %bb.0: ; %entry
8246; GFX10-WGP-NEXT:    s_clause 0x1
8247; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8248; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8249; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8250; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8251; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8252; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8253; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8254; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8255; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8256; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8257; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8258; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8259; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8260; GFX10-WGP-NEXT:    buffer_gl0_inv
8261; GFX10-WGP-NEXT:    buffer_gl1_inv
8262; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8263; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8264; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8265; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8266; GFX10-WGP-NEXT:    s_endpgm
8267;
8268; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8269; GFX10-CU:       ; %bb.0: ; %entry
8270; GFX10-CU-NEXT:    s_clause 0x1
8271; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8272; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8273; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8274; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8275; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8276; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8277; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8278; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8279; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8280; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8281; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8282; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8283; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8284; GFX10-CU-NEXT:    buffer_gl0_inv
8285; GFX10-CU-NEXT:    buffer_gl1_inv
8286; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8287; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8288; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8289; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8290; GFX10-CU-NEXT:    s_endpgm
8291;
8292; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8293; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8294; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8295; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8296; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8297; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8298; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8299; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8301; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8302; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8303; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8304; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8305; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8308; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8309; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8310; SKIP-CACHE-INV-NEXT:    s_endpgm
8311;
8312; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8313; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8314; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8316; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8317; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8318; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8319; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
8320; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8321; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8322; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8323; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8324; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8325; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8326; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8327; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8328;
8329; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
8330; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8331; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8332; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8333; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8334; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8335; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8336; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
8337; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8338; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8339; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8340; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8341; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8342; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8343; GFX90A-TGSPLIT-NEXT:    s_endpgm
8344    i32* %out, i32 %in, i32 %old) {
8345entry:
8346  %gep = getelementptr i32, i32* %out, i32 4
8347  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
8348  %val0 = extractvalue { i32, i1 } %val, 0
8349  store i32 %val0, i32* %out, align 4
8350  ret void
8351}
8352
; Test kernel: volatile cmpxchg with syncscope("one-as"), success ordering
; `seq_cst` and failure ordering `monotonic`, whose loaded value is used.
; It compare-and-swaps the i32 at %out[4] (the 16-byte offset seen in the
; expected output; compared against %old, replaced by %in) and stores the
; value that was loaded (field 0 of the result) to %out[0].
; In every expected sequence below an `s_waitcnt vmcnt(0)` both precedes and
; follows the atomic; the trailing wait is followed by cache-invalidate
; instructions except in the run that passes -amdgcn-skip-cache-invalidations.
; The assertion lines inside the signature are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8353define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
8354; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8355; GFX7:       ; %bb.0: ; %entry
8356; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8357; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8359; GFX7-NEXT:    s_add_u32 s4, s0, 16
8360; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8361; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8362; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8363; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8364; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8365; GFX7-NEXT:    s_waitcnt vmcnt(0)
8366; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8367; GFX7-NEXT:    s_waitcnt vmcnt(0)
8368; GFX7-NEXT:    buffer_wbinvl1_vol
8369; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8370; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8371; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8372; GFX7-NEXT:    flat_store_dword v[0:1], v2
8373; GFX7-NEXT:    s_endpgm
8374;
8375; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8376; GFX10-WGP:       ; %bb.0: ; %entry
8377; GFX10-WGP-NEXT:    s_clause 0x1
8378; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8379; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8380; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8381; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8382; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8383; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8384; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8385; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8386; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8387; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8388; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8389; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8390; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8391; GFX10-WGP-NEXT:    buffer_gl0_inv
8392; GFX10-WGP-NEXT:    buffer_gl1_inv
8393; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8394; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8395; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8396; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8397; GFX10-WGP-NEXT:    s_endpgm
8398;
8399; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8400; GFX10-CU:       ; %bb.0: ; %entry
8401; GFX10-CU-NEXT:    s_clause 0x1
8402; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8403; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8404; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8405; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8406; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8407; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8408; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8409; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8410; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8411; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8412; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8413; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8414; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8415; GFX10-CU-NEXT:    buffer_gl0_inv
8416; GFX10-CU-NEXT:    buffer_gl1_inv
8417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8418; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8419; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8420; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8421; GFX10-CU-NEXT:    s_endpgm
8422;
8423; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8424; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8425; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8426; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8427; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8428; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8429; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8432; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8433; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8434; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8435; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8436; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8437; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8438; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8439; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8440; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8441; SKIP-CACHE-INV-NEXT:    s_endpgm
8442;
8443; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8444; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8445; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8446; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8447; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8448; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8449; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8450; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
8451; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8452; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8453; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8454; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8455; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8456; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8457; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8458; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8459;
8460; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
8461; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8462; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8463; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8464; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8465; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8466; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8467; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
8468; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8469; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8470; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8471; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8472; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8473; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8474; GFX90A-TGSPLIT-NEXT:    s_endpgm
8475    i32* %out, i32 %in, i32 %old) {
8476entry:
8477  %gep = getelementptr i32, i32* %out, i32 4
8478  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
8479  %val0 = extractvalue { i32, i1 } %val, 0
8480  store i32 %val0, i32* %out, align 4
8481  ret void
8482}
8483
; Test kernel: volatile cmpxchg with syncscope("one-as"), success ordering
; `monotonic` and failure ordering `acquire`, whose loaded value is used.
; It compare-and-swaps the i32 at %out[4] (the 16-byte offset seen in the
; expected output; compared against %old, replaced by %in) and stores the
; value that was loaded (field 0 of the result) to %out[0].
; In the expected sequences below no wait is placed directly before the
; atomic; an `s_waitcnt vmcnt(0)` follows it, and that wait is followed by
; cache-invalidate instructions except in the run that passes
; -amdgcn-skip-cache-invalidations.
; The assertion lines inside the signature are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8484define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
8485; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8486; GFX7:       ; %bb.0: ; %entry
8487; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8488; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8489; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8490; GFX7-NEXT:    s_add_u32 s4, s0, 16
8491; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8492; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8493; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8494; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8495; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8496; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8497; GFX7-NEXT:    s_waitcnt vmcnt(0)
8498; GFX7-NEXT:    buffer_wbinvl1_vol
8499; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8500; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8501; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8502; GFX7-NEXT:    flat_store_dword v[0:1], v2
8503; GFX7-NEXT:    s_endpgm
8504;
8505; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8506; GFX10-WGP:       ; %bb.0: ; %entry
8507; GFX10-WGP-NEXT:    s_clause 0x1
8508; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8509; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8510; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8511; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8512; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8513; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8514; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8515; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8516; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8517; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8518; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8519; GFX10-WGP-NEXT:    buffer_gl0_inv
8520; GFX10-WGP-NEXT:    buffer_gl1_inv
8521; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8522; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8523; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8524; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8525; GFX10-WGP-NEXT:    s_endpgm
8526;
8527; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8528; GFX10-CU:       ; %bb.0: ; %entry
8529; GFX10-CU-NEXT:    s_clause 0x1
8530; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8531; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8532; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8533; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8534; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8535; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8536; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8537; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8538; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8539; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8540; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8541; GFX10-CU-NEXT:    buffer_gl0_inv
8542; GFX10-CU-NEXT:    buffer_gl1_inv
8543; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8544; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8545; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8546; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8547; GFX10-CU-NEXT:    s_endpgm
8548;
8549; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8550; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8551; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8552; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8553; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8554; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8555; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8556; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8557; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8558; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8559; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8560; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8561; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8562; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8563; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8564; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8565; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8566; SKIP-CACHE-INV-NEXT:    s_endpgm
8567;
8568; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8569; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8570; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8571; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8572; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8573; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8574; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8575; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8576; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8577; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8578; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8579; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8580; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8581; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8582;
8583; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
8584; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8585; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8586; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8587; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8588; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8589; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8590; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8591; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8592; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8593; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8594; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8595; GFX90A-TGSPLIT-NEXT:    s_endpgm
8596    i32* %out, i32 %in, i32 %old) {
8597entry:
8598  %gep = getelementptr i32, i32* %out, i32 4
8599  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
8600  %val0 = extractvalue { i32, i1 } %val, 0
8601  store i32 %val0, i32* %out, align 4
8602  ret void
8603}
8604
; Test kernel: volatile cmpxchg with syncscope("one-as"), success ordering
; `acquire` and failure ordering `acquire`, whose loaded value is used.
; It compare-and-swaps the i32 at %out[4] (the 16-byte offset seen in the
; expected output; compared against %old, replaced by %in) and stores the
; value that was loaded (field 0 of the result) to %out[0].
; In the expected sequences below no wait is placed directly before the
; atomic; an `s_waitcnt vmcnt(0)` follows it, and that wait is followed by
; cache-invalidate instructions except in the run that passes
; -amdgcn-skip-cache-invalidations.
; The assertion lines inside the signature are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8605define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
8606; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8607; GFX7:       ; %bb.0: ; %entry
8608; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8609; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
8610; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8611; GFX7-NEXT:    s_add_u32 s4, s0, 16
8612; GFX7-NEXT:    s_addc_u32 s5, s1, 0
8613; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8614; GFX7-NEXT:    v_mov_b32_e32 v2, s2
8615; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8616; GFX7-NEXT:    v_mov_b32_e32 v3, s3
8617; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8618; GFX7-NEXT:    s_waitcnt vmcnt(0)
8619; GFX7-NEXT:    buffer_wbinvl1_vol
8620; GFX7-NEXT:    v_mov_b32_e32 v0, s0
8621; GFX7-NEXT:    v_mov_b32_e32 v1, s1
8622; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8623; GFX7-NEXT:    flat_store_dword v[0:1], v2
8624; GFX7-NEXT:    s_endpgm
8625;
8626; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8627; GFX10-WGP:       ; %bb.0: ; %entry
8628; GFX10-WGP-NEXT:    s_clause 0x1
8629; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8630; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8631; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8632; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
8633; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
8634; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8635; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
8636; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8637; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
8638; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8639; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8640; GFX10-WGP-NEXT:    buffer_gl0_inv
8641; GFX10-WGP-NEXT:    buffer_gl1_inv
8642; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
8643; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
8644; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8645; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8646; GFX10-WGP-NEXT:    s_endpgm
8647;
8648; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8649; GFX10-CU:       ; %bb.0: ; %entry
8650; GFX10-CU-NEXT:    s_clause 0x1
8651; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8652; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8653; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8654; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
8655; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
8656; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8657; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
8658; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8659; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
8660; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8661; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8662; GFX10-CU-NEXT:    buffer_gl0_inv
8663; GFX10-CU-NEXT:    buffer_gl1_inv
8664; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
8665; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
8666; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8667; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8668; GFX10-CU-NEXT:    s_endpgm
8669;
8670; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8671; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8672; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
8673; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8674; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8675; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
8676; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
8677; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
8679; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
8680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
8681; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8682; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8685; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8686; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8687; SKIP-CACHE-INV-NEXT:    s_endpgm
8688;
8689; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8690; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8691; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8692; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8693; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8694; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8695; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8696; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8697; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8698; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8699; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8700; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8701; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8702; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8703;
8704; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
8705; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8706; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8707; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
8708; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8709; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
8710; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
8711; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8712; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8713; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8714; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8715; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8716; GFX90A-TGSPLIT-NEXT:    s_endpgm
8717    i32* %out, i32 %in, i32 %old) {
8718entry:
8719  %gep = getelementptr i32, i32* %out, i32 4
8720  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
8721  %val0 = extractvalue { i32, i1 } %val, 0
8722  store i32 %val0, i32* %out, align 4
8723  ret void
8724}
8725
; Returning cmpxchg through a plain i32* pointer with syncscope("one-as"),
; success ordering release and failure ordering acquire.
;
; The kernel compare-and-swaps element 4 of %out from %old to %in and then
; stores the value returned by the cmpxchg (field 0 of its result) to %out[0].
; All six configurations are expected to select flat_atomic_cmpswap with glc.
;
; Every prefix expects an "s_waitcnt vmcnt(0)" on both sides of the atomic
; (the GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" before it).
; The expected cache maintenance differs per configuration:
;   - gfx700 (amdhsa): buffer_wbinvl1_vol after the atomic;
;   - gfx1010, WGP and CU mode: buffer_gl0_inv + buffer_gl1_inv after it;
;   - gfx700 with -amdgcn-skip-cache-invalidations: no invalidate at all;
;   - gfx90a, with and without tgsplit: buffer_wbl2 before the atomic and
;     buffer_invl2 + buffer_wbinvl1_vol after it.
define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of %out, i.e. the byte offset of 16 seen in the checks above.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
8856
; Returning cmpxchg through a plain i32* pointer with syncscope("one-as"),
; success ordering acq_rel and failure ordering acquire.
;
; The kernel compare-and-swaps element 4 of %out from %old to %in and then
; stores the value returned by the cmpxchg (field 0 of its result) to %out[0].
; All six configurations are expected to select flat_atomic_cmpswap with glc.
;
; Every prefix expects an "s_waitcnt vmcnt(0)" on both sides of the atomic
; (the GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" before it).
; The expected cache maintenance differs per configuration:
;   - gfx700 (amdhsa): buffer_wbinvl1_vol after the atomic;
;   - gfx1010, WGP and CU mode: buffer_gl0_inv + buffer_gl1_inv after it;
;   - gfx700 with -amdgcn-skip-cache-invalidations: no invalidate at all;
;   - gfx90a, with and without tgsplit: buffer_wbl2 before the atomic and
;     buffer_invl2 + buffer_wbinvl1_vol after it.
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of %out, i.e. the byte offset of 16 seen in the checks above.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
8987
; Returning cmpxchg through a plain i32* pointer with syncscope("one-as"),
; success ordering seq_cst and failure ordering acquire.
;
; The kernel compare-and-swaps element 4 of %out from %old to %in and then
; stores the value returned by the cmpxchg (field 0 of its result) to %out[0].
; All six configurations are expected to select flat_atomic_cmpswap with glc.
;
; Every prefix expects an "s_waitcnt vmcnt(0)" on both sides of the atomic
; (the GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" before it).
; The expected cache maintenance differs per configuration:
;   - gfx700 (amdhsa): buffer_wbinvl1_vol after the atomic;
;   - gfx1010, WGP and CU mode: buffer_gl0_inv + buffer_gl1_inv after it;
;   - gfx700 with -amdgcn-skip-cache-invalidations: no invalidate at all;
;   - gfx90a, with and without tgsplit: buffer_wbl2 before the atomic and
;     buffer_invl2 + buffer_wbinvl1_vol after it.
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of %out, i.e. the byte offset of 16 seen in the checks above.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
9118
; Returning cmpxchg through a plain i32* pointer with syncscope("one-as"),
; success ordering monotonic and failure ordering seq_cst.
;
; The kernel compare-and-swaps element 4 of %out from %old to %in and then
; stores the value returned by the cmpxchg (field 0 of its result) to %out[0].
; All six configurations are expected to select flat_atomic_cmpswap with glc.
;
; Although the success ordering is only monotonic, the checks still expect the
; waits and cache maintenance around the atomic (presumably driven by the
; seq_cst failure ordering - confirm against the memory legalizer).
;
; Every prefix expects an "s_waitcnt vmcnt(0)" on both sides of the atomic
; (the GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" before it).
; The expected cache maintenance differs per configuration:
;   - gfx700 (amdhsa): buffer_wbinvl1_vol after the atomic;
;   - gfx1010, WGP and CU mode: buffer_gl0_inv + buffer_gl1_inv after it;
;   - gfx700 with -amdgcn-skip-cache-invalidations: no invalidate at all;
;   - gfx90a, with and without tgsplit: buffer_wbl2 before the atomic and
;     buffer_invl2 + buffer_wbinvl1_vol after it.
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of %out, i.e. the byte offset of 16 seen in the checks above.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
9249
; Returning cmpxchg through a plain i32* pointer with syncscope("one-as"),
; success ordering acquire and failure ordering seq_cst.
;
; The kernel compare-and-swaps element 4 of %out from %old to %in and then
; stores the value returned by the cmpxchg (field 0 of its result) to %out[0].
; All six configurations are expected to select flat_atomic_cmpswap with glc.
;
; Every prefix expects an "s_waitcnt vmcnt(0)" on both sides of the atomic
; (the GFX10 runs additionally expect "s_waitcnt_vscnt null, 0x0" before it).
; The expected cache maintenance differs per configuration:
;   - gfx700 (amdhsa): buffer_wbinvl1_vol after the atomic;
;   - gfx1010, WGP and CU mode: buffer_gl0_inv + buffer_gl1_inv after it;
;   - gfx700 with -amdgcn-skip-cache-invalidations: no invalidate at all;
;   - gfx90a, with and without tgsplit: buffer_wbl2 before the atomic and
;     buffer_invl2 + buffer_wbinvl1_vol after it.
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    buffer_gl1_inv
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    buffer_gl0_inv
; GFX10-CU-NEXT:    buffer_gl1_inv
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    buffer_invl2
; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Element 4 of %out, i.e. the byte offset of 16 seen in the checks above.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
9380
; Kernel under test, a value-returning cmpxchg at syncscope "one-as" with
; release ordering on success and seq_cst ordering on failure. The cmpxchg
; targets element 4 of %out, and the value it loaded is written back to the
; first element of %out.
; The assertion comments inside the definition are autogenerated (see the NOTE
; on the first line of this file), so regenerate them instead of hand-editing.
9381define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
9382; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9383; GFX7:       ; %bb.0: ; %entry
9384; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9385; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9386; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9387; GFX7-NEXT:    s_add_u32 s4, s0, 16
9388; GFX7-NEXT:    s_addc_u32 s5, s1, 0
9389; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9390; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9391; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9392; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9393; GFX7-NEXT:    s_waitcnt vmcnt(0)
9394; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9395; GFX7-NEXT:    s_waitcnt vmcnt(0)
9396; GFX7-NEXT:    buffer_wbinvl1_vol
9397; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9398; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9399; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9400; GFX7-NEXT:    flat_store_dword v[0:1], v2
9401; GFX7-NEXT:    s_endpgm
9402;
9403; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9404; GFX10-WGP:       ; %bb.0: ; %entry
9405; GFX10-WGP-NEXT:    s_clause 0x1
9406; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9407; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9408; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9409; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
9410; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
9411; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9412; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9413; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9414; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9415; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9416; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9417; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9418; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9419; GFX10-WGP-NEXT:    buffer_gl0_inv
9420; GFX10-WGP-NEXT:    buffer_gl1_inv
9421; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9422; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9423; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9424; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9425; GFX10-WGP-NEXT:    s_endpgm
9426;
9427; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9428; GFX10-CU:       ; %bb.0: ; %entry
9429; GFX10-CU-NEXT:    s_clause 0x1
9430; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9431; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9432; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9433; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
9434; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
9435; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9436; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9437; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9438; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9439; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9440; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9441; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9442; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9443; GFX10-CU-NEXT:    buffer_gl0_inv
9444; GFX10-CU-NEXT:    buffer_gl1_inv
9445; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9446; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9447; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9448; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9449; GFX10-CU-NEXT:    s_endpgm
9450;
9451; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9452; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9453; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
9454; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
9455; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9456; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
9457; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
9458; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9459; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9460; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
9461; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9462; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9463; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9464; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9465; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9466; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9467; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9468; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9469; SKIP-CACHE-INV-NEXT:    s_endpgm
9470;
9471; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9472; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9473; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9474; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9475; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9476; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9477; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9478; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9479; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9480; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9481; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9482; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9483; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9484; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9485; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9486; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9487;
9488; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
9489; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9490; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9491; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9492; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9493; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9494; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9495; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9496; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9497; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9498; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9499; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9500; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9501; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9502; GFX90A-TGSPLIT-NEXT:    s_endpgm
9503    i32* %out, i32 %in, i32 %old) {
9504entry:
  ; Address of element 4 of %out (16 bytes past the base pointer).
9505  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; release on success, seq_cst on failure.
9506  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
9507  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to the first element of %out.
9508  store i32 %val0, i32* %out, align 4
9509  ret void
9510}
9511
; Kernel under test, a value-returning cmpxchg at syncscope "one-as" with
; acq_rel ordering on success and seq_cst ordering on failure. The cmpxchg
; targets element 4 of %out, and the value it loaded is written back to the
; first element of %out.
; The assertion comments inside the definition are autogenerated (see the NOTE
; on the first line of this file), so regenerate them instead of hand-editing.
9512define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
9513; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9514; GFX7:       ; %bb.0: ; %entry
9515; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9516; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9517; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9518; GFX7-NEXT:    s_add_u32 s4, s0, 16
9519; GFX7-NEXT:    s_addc_u32 s5, s1, 0
9520; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9521; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9522; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9523; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9524; GFX7-NEXT:    s_waitcnt vmcnt(0)
9525; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9526; GFX7-NEXT:    s_waitcnt vmcnt(0)
9527; GFX7-NEXT:    buffer_wbinvl1_vol
9528; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9529; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9530; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9531; GFX7-NEXT:    flat_store_dword v[0:1], v2
9532; GFX7-NEXT:    s_endpgm
9533;
9534; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9535; GFX10-WGP:       ; %bb.0: ; %entry
9536; GFX10-WGP-NEXT:    s_clause 0x1
9537; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9538; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9539; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9540; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
9541; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
9542; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9543; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9544; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9545; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9546; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9547; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9548; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9549; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9550; GFX10-WGP-NEXT:    buffer_gl0_inv
9551; GFX10-WGP-NEXT:    buffer_gl1_inv
9552; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9553; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9554; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9555; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9556; GFX10-WGP-NEXT:    s_endpgm
9557;
9558; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9559; GFX10-CU:       ; %bb.0: ; %entry
9560; GFX10-CU-NEXT:    s_clause 0x1
9561; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9562; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9563; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9564; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
9565; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
9566; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9567; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9568; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9569; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9570; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9571; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9572; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9573; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9574; GFX10-CU-NEXT:    buffer_gl0_inv
9575; GFX10-CU-NEXT:    buffer_gl1_inv
9576; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9577; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9578; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9579; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9580; GFX10-CU-NEXT:    s_endpgm
9581;
9582; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9583; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9584; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
9585; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
9586; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9587; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
9588; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
9589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
9592; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9593; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9594; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9595; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9598; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9599; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9600; SKIP-CACHE-INV-NEXT:    s_endpgm
9601;
9602; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9603; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9604; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9605; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9606; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9607; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9608; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9609; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9610; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9611; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9612; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9613; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9614; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9615; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9616; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9617; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9618;
9619; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
9620; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9621; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9622; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9623; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9624; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9625; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9626; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9627; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9628; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9629; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9630; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9631; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9632; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9633; GFX90A-TGSPLIT-NEXT:    s_endpgm
9634    i32* %out, i32 %in, i32 %old) {
9635entry:
  ; Address of element 4 of %out (16 bytes past the base pointer).
9636  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; acq_rel on success, seq_cst on failure.
9637  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
9638  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to the first element of %out.
9639  store i32 %val0, i32* %out, align 4
9640  ret void
9641}
9642
; Kernel under test, a value-returning cmpxchg at syncscope "one-as" with
; seq_cst ordering on both success and failure. The cmpxchg targets element 4
; of %out, and the value it loaded is written back to the first element of
; %out.
; The assertion comments inside the definition are autogenerated (see the NOTE
; on the first line of this file), so regenerate them instead of hand-editing.
9643define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
9644; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9645; GFX7:       ; %bb.0: ; %entry
9646; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9647; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
9648; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9649; GFX7-NEXT:    s_add_u32 s4, s0, 16
9650; GFX7-NEXT:    s_addc_u32 s5, s1, 0
9651; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9652; GFX7-NEXT:    v_mov_b32_e32 v2, s2
9653; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9654; GFX7-NEXT:    v_mov_b32_e32 v3, s3
9655; GFX7-NEXT:    s_waitcnt vmcnt(0)
9656; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9657; GFX7-NEXT:    s_waitcnt vmcnt(0)
9658; GFX7-NEXT:    buffer_wbinvl1_vol
9659; GFX7-NEXT:    v_mov_b32_e32 v0, s0
9660; GFX7-NEXT:    v_mov_b32_e32 v1, s1
9661; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9662; GFX7-NEXT:    flat_store_dword v[0:1], v2
9663; GFX7-NEXT:    s_endpgm
9664;
9665; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9666; GFX10-WGP:       ; %bb.0: ; %entry
9667; GFX10-WGP-NEXT:    s_clause 0x1
9668; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9669; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9670; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9671; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
9672; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
9673; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9674; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
9675; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9676; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
9677; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9678; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9679; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9680; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9681; GFX10-WGP-NEXT:    buffer_gl0_inv
9682; GFX10-WGP-NEXT:    buffer_gl1_inv
9683; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
9684; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
9685; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9686; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9687; GFX10-WGP-NEXT:    s_endpgm
9688;
9689; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9690; GFX10-CU:       ; %bb.0: ; %entry
9691; GFX10-CU-NEXT:    s_clause 0x1
9692; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9693; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9694; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9695; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
9696; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
9697; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9698; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
9699; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9700; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
9701; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9702; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9703; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9704; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9705; GFX10-CU-NEXT:    buffer_gl0_inv
9706; GFX10-CU-NEXT:    buffer_gl1_inv
9707; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
9708; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
9709; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9710; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9711; GFX10-CU-NEXT:    s_endpgm
9712;
9713; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9714; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9715; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
9716; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
9717; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9718; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
9719; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
9720; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9721; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
9722; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
9723; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
9724; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9725; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9726; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9727; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9728; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9729; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9730; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9731; SKIP-CACHE-INV-NEXT:    s_endpgm
9732;
9733; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9734; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9735; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9736; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9737; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9738; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9739; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9740; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9741; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9742; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9743; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9744; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9745; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9746; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9747; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9748; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9749;
9750; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
9751; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9752; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9753; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
9754; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9755; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9756; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
9757; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9758; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9759; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9760; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9761; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9762; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9763; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9764; GFX90A-TGSPLIT-NEXT:    s_endpgm
9765    i32* %out, i32 %in, i32 %old) {
9766entry:
  ; Address of element 4 of %out (16 bytes past the base pointer).
9767  %gep = getelementptr i32, i32* %out, i32 4
  ; Compare against %old and swap in %in; seq_cst on both success and failure.
9768  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
9769  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to the first element of %out.
9770  store i32 %val0, i32* %out, align 4
9771  ret void
9772}
9773
9774