; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx90a -o - %s | FileCheck %s

%S = type <{ float, double }>

; The result of these atomic ops should not be used as a uniform value.

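; Each test performs an atomic operation, zero-extends the returned value, and
; uses it to index into %S. The CHECK lines verify that the returned value is
; kept in a VGPR and that the address is computed with v_mad_u64_u32 on VGPR
; operands, i.e. the divergent atomic result is never folded into scalar code.
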
define protected amdgpu_kernel void @add(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: add:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw add i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @sub(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: sub:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw sub i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @and(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw and i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @or(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: or:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw or i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @xor(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: xor:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw xor i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

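; nand has no corresponding global atomic instruction, so it is expanded into a
; cmpxchg loop; the value loaded back in the loop is divergent as well.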
define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: nand:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s6
; CHECK-NEXT:  BB5_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_not_b32_e32 v0, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_or_b32_e32 v0, -2, v0
; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB5_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw nand i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: max:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: min:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: umax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: umin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

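; The old value extracted from the cmpxchg result pair is divergent too.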
define protected amdgpu_kernel void @cmpxchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: cmpxchg:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_mov_b32_e32 v0, 2
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %agg = cmpxchg i32 addrspace(1)* %p, i32 1, i32 2 monotonic monotonic
  %n32 = extractvalue {i32, i1} %agg, 0
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @xchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: xchg:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_swap v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw xchg i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

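; llvm.amdgcn.atomic.inc/dec lower to single global_atomic_inc/dec instructions
; whose returned value is likewise divergent.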
define protected amdgpu_kernel void @inc(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: inc:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_inc v2, v0, v0, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @dec(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: dec:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_dec v2, v0, v0, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

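; In this configuration atomicrmw fadd/fsub are expanded into cmpxchg loops
; (fsub reuses v_add_f32 with the negated operand); the loop result is divergent.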
define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: fadd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s6
; CHECK-NEXT:  BB14_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_add_f32_e32 v0, 1.0, v1
; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB14_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f32 = atomicrmw fadd float addrspace(1)* %p, float 1.0 monotonic
  %n32 = fptoui float %f32 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: fsub:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s6
; CHECK-NEXT:  BB15_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_add_f32_e32 v0, -1.0, v1
; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB15_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f32 = atomicrmw fsub float addrspace(1)* %p, float 1.0 monotonic
  %n32 = fptoui float %f32 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

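; The f64 fmin/fmax intrinsics map directly to gfx90a's global_atomic_min_f64
; and global_atomic_max_f64 instructions.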
define protected amdgpu_kernel void @fmin(double addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: fmin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    v_mov_b32_e32 v3, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @fmax(double addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: fmax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    v_mov_b32_e32 v3, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

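; The raw buffer atomic intrinsics below follow the same pattern: the value
; returned by the buffer_atomic_* instruction stays in a VGPR and feeds the
; divergent address computation.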
define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.swap:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_swap v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.add:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.sub:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_sub v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.smin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_smin v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.smax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_smax v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.umin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_umin v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.umax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_umax v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_and v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.or:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_or v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.xor:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_xor v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.inc:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_inc v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.dec:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_dec v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.cmpswap:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fadd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1.0
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui float %f32 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fmin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fmax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)*, double)
declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)*, double)
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32)
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32)
declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32)