; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Returning f32 fadd on a global (addrspace 1) pointer, seq_cst, with no
; explicit syncscope, using attribute group #0 (unsafe FP atomics plus
; preserve-sign f32 denormals). The result is stored to an undef address so
; that the atomic's return value is used.
; Expected on every checked target (gfx900, gfx908, gfx90a, gfx1010): a
; global_atomic_cmpswap retry loop. On gfx90a the cmpswap is additionally
; bracketed by buffer_wbl2 / buffer_invl2.
define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v0, s4
; GFX900-NEXT:  BB0_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB0_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    s_mov_b64 s[2:3], 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s4
; GFX908-NEXT:  BB0_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT:    s_cbranch_execnz BB0_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT:    global_store_dword v[0:1], v0, off
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
; GFX90A-NEXT:  BB0_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT:    s_cbranch_execnz BB0_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB0_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB0_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Returning f32 fadd on a global pointer at syncscope("agent"), seq_cst, using
; attribute group #2 (unsafe FP atomics, no denormal-fp-math-f32 override).
; The result is stored to an undef address so that the return value is used.
; Expected: gfx90a selects the returning global_atomic_add_f32 (glc) directly;
; gfx900, gfx908 and gfx1010 expand to a global_atomic_cmpswap retry loop.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v0, s4
; GFX900-NEXT:  BB1_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB1_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    s_mov_b64 s[2:3], 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s4
; GFX908-NEXT:  BB1_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT:    s_cbranch_execnz BB1_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT:    global_store_dword v[0:1], v0, off
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB1_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB1_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; f32 fadd on a global pointer at syncscope("agent"), seq_cst, whose result is
; never used, with attribute group #0 (unsafe FP atomics plus preserve-sign
; f32 denormals).
; Expected: gfx908 and gfx90a select the non-returning global_atomic_add_f32
; directly; gfx900 and gfx1010 expand to a global_atomic_cmpswap retry loop.
define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_noret_f32:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, s4
; GFX900-NEXT:  BB2_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB2_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB2_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB2_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Same IR as the non-returning agent-scope fadd test, but with attribute
; group #2 (unsafe FP atomics, no denormal-fp-math-f32 override). The result
; of the atomic is never used.
; Expected: gfx908 and gfx90a select the non-returning global_atomic_add_f32
; directly; gfx900 and gfx1010 expand to a global_atomic_cmpswap retry loop.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, s4
; GFX900-NEXT:  BB3_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB3_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB3_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB3_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Returning f32 fadd on a global pointer at syncscope("agent"), seq_cst, using
; attribute group #0 (unsafe FP atomics plus preserve-sign f32 denormals).
; The result is stored to an undef address so that the return value is used.
; Expected: gfx90a selects the returning global_atomic_add_f32 (glc) directly;
; gfx900, gfx908 and gfx1010 expand to a global_atomic_cmpswap retry loop.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v0, s4
; GFX900-NEXT:  BB4_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB4_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    s_mov_b64 s[2:3], 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s4
; GFX908-NEXT:  BB4_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT:    s_cbranch_execnz BB4_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT:    global_store_dword v[0:1], v0, off
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB4_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB4_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Returning f32 fadd on a global pointer at syncscope("one-as"), seq_cst,
; using attribute group #0 (unsafe FP atomics plus preserve-sign f32
; denormals). The result is stored to an undef address so that the return
; value is used.
; Expected on every checked target: a global_atomic_cmpswap retry loop whose
; pre-atomic wait is on vmcnt only (no lgkmcnt). On gfx90a the cmpswap is
; additionally bracketed by buffer_wbl2 / buffer_invl2.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v0, s4
; GFX900-NEXT:  BB5_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB5_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    s_mov_b64 s[2:3], 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s4
; GFX908-NEXT:  BB5_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT:    s_cbranch_execnz BB5_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT:    global_store_dword v[0:1], v0, off
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
; GFX90A-NEXT:  BB5_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT:    s_cbranch_execnz BB5_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB5_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB5_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Returning f32 fadd at syncscope("agent"), seq_cst, using attribute group #1,
; which pins "target-cpu"="gfx803" with "+atomic-fadd-insts" on the function
; regardless of the -mcpu on the RUN line. All four RUN lines are checked
; against the same shared assertions.
; Expected: the returning form is expanded to a global_atomic_cmpswap retry
; loop, and the result is stored to an undef address.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b64 s[2:3], 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:  BB6_1: ; %atomicrmw.start
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_wbinvl1_vol
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT:    s_cbranch_execnz BB6_1
; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; f32 fadd at syncscope("agent"), seq_cst, whose result is never used, with
; attribute group #1 ("target-cpu"="gfx803" plus "+atomic-fadd-insts" pinned on
; the function). All four RUN lines are checked against the same shared
; assertions.
; Expected: the non-returning global_atomic_add_f32 is selected directly, with
; no cmpswap loop.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v1, 4.0
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_wbinvl1_vol
; GCN-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; f32 fadd at syncscope("agent"), seq_cst, whose result is never used, on a
; function with NO attribute group, so "amdgpu-unsafe-fp-atomics" is not set.
; Expected on every checked target, including gfx908 and gfx90a: a
; global_atomic_cmpswap retry loop rather than a native global_atomic_add_f32.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)* %ptr) {
; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, s4
; GFX900-NEXT:  BB8_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_wbinvl1_vol
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB8_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    s_mov_b64 s[2:3], 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v1, s4
; GFX908-NEXT:  BB8_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT:    s_cbranch_execnz BB8_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
; GFX90A-NEXT:  BB8_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT:    s_cbranch_execnz BB8_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB8_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB8_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Loads a generic (no address space) float* from an addrspace(4) kernel
; argument and performs an unused-result f32 fadd on it at
; syncscope("agent-one-as"), monotonic, align 4, with attribute group #0.
; Although the IR pointer is generic, every target's checks use global_*
; instructions, which is presumably the address-space inference the test
; name refers to. Expected: gfx908 and gfx90a select global_atomic_add_f32
; directly; gfx900 and gfx1010 expand to a global_atomic_cmpswap retry loop.
define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0 {
; GFX900-LABEL: infer_as_before_atomic:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT:    s_mov_b64 s[2:3], 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, s4
; GFX900-NEXT:  BB9_1: ; %atomicrmw.start
; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v1
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT:    v_mov_b32_e32 v1, v0
; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT:    s_cbranch_execnz BB9_1
; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT:    s_endpgm
;
; GFX908-LABEL: infer_as_before_atomic:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: infer_as_before_atomic:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: infer_as_before_atomic:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:  BB9_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execnz BB9_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
  %load = load float*, float* addrspace(4)* %arg
  %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
  ret void
}

; Attribute groups referenced by the kernels in this file.
; #0 - unsafe FP atomics enabled, f32 denormals set to preserve-sign.
; #1 - as #0, but additionally pins the function to target-cpu gfx803 with
;      +atomic-fadd-insts (used by the *_wrong_subtarget kernels).
; #2 - unsafe FP atomics enabled with no denormal-fp-math-f32 override
;      (used by the *_ieee kernels).
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }