1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
6
7; FIXME: Merge with the other test. DS offset folding doesn't work due to
8; register bank copies, and the no-return optimization is missing.
9
10
11declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
12declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
13declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2
14
15declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
16declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
17declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2
18
19declare i32 @llvm.amdgcn.workitem.id.x() #1
20
; Returning i32 atomic inc through an addrspace(3) pointer with operand 42; the
; result is stored to %out. The intrinsic call carries !noalias metadata (!0).
; All four check prefixes expect a ds_inc_rtn_u32 whose result register is then
; written out by the store.
21define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
22; CI-LABEL: lds_atomic_inc_ret_i32:
23; CI:       ; %bb.0:
24; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
25; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
26; CI-NEXT:    v_mov_b32_e32 v0, 42
27; CI-NEXT:    s_mov_b32 m0, -1
28; CI-NEXT:    s_waitcnt lgkmcnt(0)
29; CI-NEXT:    v_mov_b32_e32 v1, s2
30; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
31; CI-NEXT:    v_mov_b32_e32 v0, s0
32; CI-NEXT:    v_mov_b32_e32 v1, s1
33; CI-NEXT:    s_waitcnt lgkmcnt(0)
34; CI-NEXT:    flat_store_dword v[0:1], v2
35; CI-NEXT:    s_endpgm
36;
37; VI-LABEL: lds_atomic_inc_ret_i32:
38; VI:       ; %bb.0:
39; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
40; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
41; VI-NEXT:    v_mov_b32_e32 v0, 42
42; VI-NEXT:    s_mov_b32 m0, -1
43; VI-NEXT:    s_waitcnt lgkmcnt(0)
44; VI-NEXT:    v_mov_b32_e32 v1, s2
45; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
46; VI-NEXT:    v_mov_b32_e32 v0, s0
47; VI-NEXT:    v_mov_b32_e32 v1, s1
48; VI-NEXT:    s_waitcnt lgkmcnt(0)
49; VI-NEXT:    flat_store_dword v[0:1], v2
50; VI-NEXT:    s_endpgm
51;
52; GFX9-LABEL: lds_atomic_inc_ret_i32:
53; GFX9:       ; %bb.0:
54; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
55; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
56; GFX9-NEXT:    v_mov_b32_e32 v1, 42
57; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-NEXT:    v_mov_b32_e32 v0, s2
59; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v1
60; GFX9-NEXT:    v_mov_b32_e32 v1, 0
61; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
63; GFX9-NEXT:    s_endpgm
64;
65; GFX10-LABEL: lds_atomic_inc_ret_i32:
66; GFX10:       ; %bb.0:
67; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
68; GFX10-NEXT:    v_mov_b32_e32 v1, 42
69; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX10-NEXT:    v_mov_b32_e32 v0, s0
71; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
72; GFX10-NEXT:    ds_inc_rtn_u32 v0, v0, v1
73; GFX10-NEXT:    v_mov_b32_e32 v1, 0
74; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
76; GFX10-NEXT:    s_endpgm
77  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0
78  store i32 %result, i32 addrspace(1)* %out
79  ret void
80}
81
82!0 = !{!1}
83!1 = distinct !{!1, !2}
84!2 = distinct !{!2}
85
; Returning i32 atomic inc on an addrspace(3) pointer that is first advanced by
; 4 x i32 with a getelementptr; the result is stored to %out. Every prefix
; expects the 16-byte displacement to appear as `offset:16` on ds_inc_rtn_u32
; rather than as a separate add instruction.
86define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
87; CI-LABEL: lds_atomic_inc_ret_i32_offset:
88; CI:       ; %bb.0:
89; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
90; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
91; CI-NEXT:    v_mov_b32_e32 v0, 42
92; CI-NEXT:    s_mov_b32 m0, -1
93; CI-NEXT:    s_waitcnt lgkmcnt(0)
94; CI-NEXT:    v_mov_b32_e32 v1, s2
95; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
96; CI-NEXT:    v_mov_b32_e32 v0, s0
97; CI-NEXT:    v_mov_b32_e32 v1, s1
98; CI-NEXT:    s_waitcnt lgkmcnt(0)
99; CI-NEXT:    flat_store_dword v[0:1], v2
100; CI-NEXT:    s_endpgm
101;
102; VI-LABEL: lds_atomic_inc_ret_i32_offset:
103; VI:       ; %bb.0:
104; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
105; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
106; VI-NEXT:    v_mov_b32_e32 v0, 42
107; VI-NEXT:    s_mov_b32 m0, -1
108; VI-NEXT:    s_waitcnt lgkmcnt(0)
109; VI-NEXT:    v_mov_b32_e32 v1, s2
110; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
111; VI-NEXT:    v_mov_b32_e32 v0, s0
112; VI-NEXT:    v_mov_b32_e32 v1, s1
113; VI-NEXT:    s_waitcnt lgkmcnt(0)
114; VI-NEXT:    flat_store_dword v[0:1], v2
115; VI-NEXT:    s_endpgm
116;
117; GFX9-LABEL: lds_atomic_inc_ret_i32_offset:
118; GFX9:       ; %bb.0:
119; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
120; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
121; GFX9-NEXT:    v_mov_b32_e32 v0, 42
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    v_mov_b32_e32 v1, s2
124; GFX9-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
125; GFX9-NEXT:    v_mov_b32_e32 v1, 0
126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
128; GFX9-NEXT:    s_endpgm
129;
130; GFX10-LABEL: lds_atomic_inc_ret_i32_offset:
131; GFX10:       ; %bb.0:
132; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
133; GFX10-NEXT:    v_mov_b32_e32 v0, 42
134; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX10-NEXT:    v_mov_b32_e32 v1, s0
136; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
137; GFX10-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
138; GFX10-NEXT:    v_mov_b32_e32 v1, 0
139; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
141; GFX10-NEXT:    s_endpgm
142  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
143  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
144  store i32 %result, i32 addrspace(1)* %out
145  ret void
146}
147
; i32 atomic inc on an addrspace(3) pointer whose result (%result) is never
; used. The checks nevertheless expect the returning ds_inc_rtn_u32 form on
; every prefix; see the FIXME at the top of the file regarding the no-return
; optimization.
148define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
149; CI-LABEL: lds_atomic_inc_noret_i32:
150; CI:       ; %bb.0:
151; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
152; CI-NEXT:    v_mov_b32_e32 v0, 42
153; CI-NEXT:    s_mov_b32 m0, -1
154; CI-NEXT:    s_waitcnt lgkmcnt(0)
155; CI-NEXT:    v_mov_b32_e32 v1, s0
156; CI-NEXT:    ds_inc_rtn_u32 v0, v1, v0
157; CI-NEXT:    s_endpgm
158;
159; VI-LABEL: lds_atomic_inc_noret_i32:
160; VI:       ; %bb.0:
161; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
162; VI-NEXT:    v_mov_b32_e32 v0, 42
163; VI-NEXT:    s_mov_b32 m0, -1
164; VI-NEXT:    s_waitcnt lgkmcnt(0)
165; VI-NEXT:    v_mov_b32_e32 v1, s0
166; VI-NEXT:    ds_inc_rtn_u32 v0, v1, v0
167; VI-NEXT:    s_endpgm
168;
169; GFX9-LABEL: lds_atomic_inc_noret_i32:
170; GFX9:       ; %bb.0:
171; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
172; GFX9-NEXT:    v_mov_b32_e32 v1, 42
173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX9-NEXT:    v_mov_b32_e32 v0, s0
175; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v1
176; GFX9-NEXT:    s_endpgm
177;
178; GFX10-LABEL: lds_atomic_inc_noret_i32:
179; GFX10:       ; %bb.0:
180; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
181; GFX10-NEXT:    v_mov_b32_e32 v1, 42
182; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX10-NEXT:    v_mov_b32_e32 v0, s0
184; GFX10-NEXT:    ds_inc_rtn_u32 v0, v0, v1
185; GFX10-NEXT:    s_endpgm
186  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
187  ret void
188}
189
; i32 atomic inc with an unused result on an addrspace(3) pointer advanced by
; 4 x i32. Every prefix expects `offset:16` on the instruction, and the checks
; still show the returning ds_inc_rtn_u32 form even though %result is dead.
190define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
191; CI-LABEL: lds_atomic_inc_noret_i32_offset:
192; CI:       ; %bb.0:
193; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
194; CI-NEXT:    v_mov_b32_e32 v0, 42
195; CI-NEXT:    s_mov_b32 m0, -1
196; CI-NEXT:    s_waitcnt lgkmcnt(0)
197; CI-NEXT:    v_mov_b32_e32 v1, s0
198; CI-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
199; CI-NEXT:    s_endpgm
200;
201; VI-LABEL: lds_atomic_inc_noret_i32_offset:
202; VI:       ; %bb.0:
203; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
204; VI-NEXT:    v_mov_b32_e32 v0, 42
205; VI-NEXT:    s_mov_b32 m0, -1
206; VI-NEXT:    s_waitcnt lgkmcnt(0)
207; VI-NEXT:    v_mov_b32_e32 v1, s0
208; VI-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
209; VI-NEXT:    s_endpgm
210;
211; GFX9-LABEL: lds_atomic_inc_noret_i32_offset:
212; GFX9:       ; %bb.0:
213; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
214; GFX9-NEXT:    v_mov_b32_e32 v0, 42
215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX9-NEXT:    v_mov_b32_e32 v1, s0
217; GFX9-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
218; GFX9-NEXT:    s_endpgm
219;
220; GFX10-LABEL: lds_atomic_inc_noret_i32_offset:
221; GFX10:       ; %bb.0:
222; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
223; GFX10-NEXT:    v_mov_b32_e32 v0, 42
224; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX10-NEXT:    v_mov_b32_e32 v1, s0
226; GFX10-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
227; GFX10-NEXT:    s_endpgm
228  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
229  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
230  ret void
231}
232
; Returning i32 atomic inc on an addrspace(1) pointer with operand 42; the
; result is stored to %out. CI and VI expect flat_atomic_inc on a VGPR address
; pair copied from s[2:3]; GFX9 and GFX10 expect global_atomic_inc taking
; s[2:3] directly together with a VGPR holding 0.
233define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
234; CI-LABEL: global_atomic_inc_ret_i32:
235; CI:       ; %bb.0:
236; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
237; CI-NEXT:    v_mov_b32_e32 v2, 42
238; CI-NEXT:    s_waitcnt lgkmcnt(0)
239; CI-NEXT:    v_mov_b32_e32 v0, s2
240; CI-NEXT:    v_mov_b32_e32 v1, s3
241; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
242; CI-NEXT:    v_mov_b32_e32 v0, s0
243; CI-NEXT:    v_mov_b32_e32 v1, s1
244; CI-NEXT:    s_waitcnt vmcnt(0)
245; CI-NEXT:    flat_store_dword v[0:1], v2
246; CI-NEXT:    s_endpgm
247;
248; VI-LABEL: global_atomic_inc_ret_i32:
249; VI:       ; %bb.0:
250; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
251; VI-NEXT:    v_mov_b32_e32 v2, 42
252; VI-NEXT:    s_waitcnt lgkmcnt(0)
253; VI-NEXT:    v_mov_b32_e32 v0, s2
254; VI-NEXT:    v_mov_b32_e32 v1, s3
255; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
256; VI-NEXT:    v_mov_b32_e32 v0, s0
257; VI-NEXT:    v_mov_b32_e32 v1, s1
258; VI-NEXT:    s_waitcnt vmcnt(0)
259; VI-NEXT:    flat_store_dword v[0:1], v2
260; VI-NEXT:    s_endpgm
261;
262; GFX9-LABEL: global_atomic_inc_ret_i32:
263; GFX9:       ; %bb.0:
264; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
265; GFX9-NEXT:    v_mov_b32_e32 v0, 42
266; GFX9-NEXT:    v_mov_b32_e32 v1, 0
267; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] glc
269; GFX9-NEXT:    s_waitcnt vmcnt(0)
270; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
271; GFX9-NEXT:    s_endpgm
272;
273; GFX10-LABEL: global_atomic_inc_ret_i32:
274; GFX10:       ; %bb.0:
275; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
276; GFX10-NEXT:    v_mov_b32_e32 v0, 42
277; GFX10-NEXT:    v_mov_b32_e32 v1, 0
278; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX10-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] glc
280; GFX10-NEXT:    s_waitcnt vmcnt(0)
281; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
282; GFX10-NEXT:    s_endpgm
283  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
284  store i32 %result, i32 addrspace(1)* %out
285  ret void
286}
287
; Returning i32 atomic inc on an addrspace(1) pointer advanced by 4 x i32; the
; result is stored to %out. CI and VI expect the 16-byte displacement to be
; added with s_add_u32/s_addc_u32 before flat_atomic_inc, while GFX9 and GFX10
; expect it as `offset:16` on global_atomic_inc.
288define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
289; CI-LABEL: global_atomic_inc_ret_i32_offset:
290; CI:       ; %bb.0:
291; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
292; CI-NEXT:    v_mov_b32_e32 v2, 42
293; CI-NEXT:    s_waitcnt lgkmcnt(0)
294; CI-NEXT:    s_add_u32 s2, s2, 16
295; CI-NEXT:    s_addc_u32 s3, s3, 0
296; CI-NEXT:    v_mov_b32_e32 v0, s2
297; CI-NEXT:    v_mov_b32_e32 v1, s3
298; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
299; CI-NEXT:    v_mov_b32_e32 v0, s0
300; CI-NEXT:    v_mov_b32_e32 v1, s1
301; CI-NEXT:    s_waitcnt vmcnt(0)
302; CI-NEXT:    flat_store_dword v[0:1], v2
303; CI-NEXT:    s_endpgm
304;
305; VI-LABEL: global_atomic_inc_ret_i32_offset:
306; VI:       ; %bb.0:
307; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
308; VI-NEXT:    v_mov_b32_e32 v2, 42
309; VI-NEXT:    s_waitcnt lgkmcnt(0)
310; VI-NEXT:    s_add_u32 s2, s2, 16
311; VI-NEXT:    s_addc_u32 s3, s3, 0
312; VI-NEXT:    v_mov_b32_e32 v0, s2
313; VI-NEXT:    v_mov_b32_e32 v1, s3
314; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
315; VI-NEXT:    v_mov_b32_e32 v0, s0
316; VI-NEXT:    v_mov_b32_e32 v1, s1
317; VI-NEXT:    s_waitcnt vmcnt(0)
318; VI-NEXT:    flat_store_dword v[0:1], v2
319; VI-NEXT:    s_endpgm
320;
321; GFX9-LABEL: global_atomic_inc_ret_i32_offset:
322; GFX9:       ; %bb.0:
323; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
324; GFX9-NEXT:    v_mov_b32_e32 v0, 42
325; GFX9-NEXT:    v_mov_b32_e32 v1, 0
326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
328; GFX9-NEXT:    s_waitcnt vmcnt(0)
329; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
330; GFX9-NEXT:    s_endpgm
331;
332; GFX10-LABEL: global_atomic_inc_ret_i32_offset:
333; GFX10:       ; %bb.0:
334; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
335; GFX10-NEXT:    v_mov_b32_e32 v0, 42
336; GFX10-NEXT:    v_mov_b32_e32 v1, 0
337; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX10-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
339; GFX10-NEXT:    s_waitcnt vmcnt(0)
340; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
341; GFX10-NEXT:    s_endpgm
342  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
343  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
344  store i32 %result, i32 addrspace(1)* %out
345  ret void
346}
347
; i32 atomic inc on an addrspace(1) pointer whose result (%result) is never
; used. The checks still show a destination VGPR and the `glc` modifier on
; flat_atomic_inc (CI, VI) and global_atomic_inc (GFX9, GFX10), the same
; instruction shape the returning tests in this file expect.
348define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
349; CI-LABEL: global_atomic_inc_noret_i32:
350; CI:       ; %bb.0:
351; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
352; CI-NEXT:    v_mov_b32_e32 v2, 42
353; CI-NEXT:    s_waitcnt lgkmcnt(0)
354; CI-NEXT:    v_mov_b32_e32 v0, s0
355; CI-NEXT:    v_mov_b32_e32 v1, s1
356; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
357; CI-NEXT:    s_endpgm
358;
359; VI-LABEL: global_atomic_inc_noret_i32:
360; VI:       ; %bb.0:
361; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
362; VI-NEXT:    v_mov_b32_e32 v2, 42
363; VI-NEXT:    s_waitcnt lgkmcnt(0)
364; VI-NEXT:    v_mov_b32_e32 v0, s0
365; VI-NEXT:    v_mov_b32_e32 v1, s1
366; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
367; VI-NEXT:    s_endpgm
368;
369; GFX9-LABEL: global_atomic_inc_noret_i32:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
372; GFX9-NEXT:    v_mov_b32_e32 v0, 42
373; GFX9-NEXT:    v_mov_b32_e32 v1, 0
374; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] glc
376; GFX9-NEXT:    s_endpgm
377;
378; GFX10-LABEL: global_atomic_inc_noret_i32:
379; GFX10:       ; %bb.0:
380; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
381; GFX10-NEXT:    v_mov_b32_e32 v0, 42
382; GFX10-NEXT:    v_mov_b32_e32 v1, 0
383; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX10-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] glc
385; GFX10-NEXT:    s_endpgm
386  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
387  ret void
388}
389
; i32 atomic inc with an unused result on an addrspace(1) pointer advanced by
; 4 x i32. CI and VI expect the 16-byte displacement to be added with
; s_add_u32/s_addc_u32 before flat_atomic_inc; GFX9 and GFX10 expect
; `offset:16` on global_atomic_inc. All prefixes still show a destination VGPR
; and `glc` despite %result being dead.
390define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
391; CI-LABEL: global_atomic_inc_noret_i32_offset:
392; CI:       ; %bb.0:
393; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
394; CI-NEXT:    v_mov_b32_e32 v2, 42
395; CI-NEXT:    s_waitcnt lgkmcnt(0)
396; CI-NEXT:    s_add_u32 s0, s0, 16
397; CI-NEXT:    s_addc_u32 s1, s1, 0
398; CI-NEXT:    v_mov_b32_e32 v0, s0
399; CI-NEXT:    v_mov_b32_e32 v1, s1
400; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
401; CI-NEXT:    s_endpgm
402;
403; VI-LABEL: global_atomic_inc_noret_i32_offset:
404; VI:       ; %bb.0:
405; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
406; VI-NEXT:    v_mov_b32_e32 v2, 42
407; VI-NEXT:    s_waitcnt lgkmcnt(0)
408; VI-NEXT:    s_add_u32 s0, s0, 16
409; VI-NEXT:    s_addc_u32 s1, s1, 0
410; VI-NEXT:    v_mov_b32_e32 v0, s0
411; VI-NEXT:    v_mov_b32_e32 v1, s1
412; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
413; VI-NEXT:    s_endpgm
414;
415; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
416; GFX9:       ; %bb.0:
417; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
418; GFX9-NEXT:    v_mov_b32_e32 v0, 42
419; GFX9-NEXT:    v_mov_b32_e32 v1, 0
420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc
422; GFX9-NEXT:    s_endpgm
423;
424; GFX10-LABEL: global_atomic_inc_noret_i32_offset:
425; GFX10:       ; %bb.0:
426; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
427; GFX10-NEXT:    v_mov_b32_e32 v0, 42
428; GFX10-NEXT:    v_mov_b32_e32 v1, 0
429; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX10-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc
431; GFX10-NEXT:    s_endpgm
432  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
433  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
434  ret void
435}
436
; Returning i32 atomic inc on an addrspace(1) pointer indexed by
; llvm.amdgcn.workitem.id.x and then advanced by a further 5 x i32; the result
; is stored to %out indexed by the same id. CI and VI expect the addresses to
; be built with v_add/v_addc pairs (including a separate add of 20) before
; flat_atomic_inc. GFX9 and GFX10 expect a left shift of v0 by 2 used as the
; VGPR operand, with the constant part as `offset:20` on global_atomic_inc.
437define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
438; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
439; CI:       ; %bb.0:
440; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
441; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
442; CI-NEXT:    s_waitcnt lgkmcnt(0)
443; CI-NEXT:    v_mov_b32_e32 v0, s2
444; CI-NEXT:    v_mov_b32_e32 v1, s3
445; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
446; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
447; CI-NEXT:    v_mov_b32_e32 v0, s0
448; CI-NEXT:    v_mov_b32_e32 v1, s1
449; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
450; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
451; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
452; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
453; CI-NEXT:    v_mov_b32_e32 v4, 42
454; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
455; CI-NEXT:    s_waitcnt vmcnt(0)
456; CI-NEXT:    flat_store_dword v[0:1], v2
457; CI-NEXT:    s_endpgm
458;
459; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
460; VI:       ; %bb.0:
461; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
462; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
463; VI-NEXT:    s_waitcnt lgkmcnt(0)
464; VI-NEXT:    v_mov_b32_e32 v0, s2
465; VI-NEXT:    v_mov_b32_e32 v1, s3
466; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
467; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
468; VI-NEXT:    v_mov_b32_e32 v0, s0
469; VI-NEXT:    v_mov_b32_e32 v1, s1
470; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
471; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
472; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
473; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
474; VI-NEXT:    v_mov_b32_e32 v4, 42
475; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
476; VI-NEXT:    s_waitcnt vmcnt(0)
477; VI-NEXT:    flat_store_dword v[0:1], v2
478; VI-NEXT:    s_endpgm
479;
480; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
481; GFX9:       ; %bb.0:
482; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
483; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
484; GFX9-NEXT:    v_mov_b32_e32 v1, 42
485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX9-NEXT:    global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
487; GFX9-NEXT:    s_waitcnt vmcnt(0)
488; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
489; GFX9-NEXT:    s_endpgm
490;
491; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64:
492; GFX10:       ; %bb.0:
493; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
494; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
495; GFX10-NEXT:    v_mov_b32_e32 v1, 42
496; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX10-NEXT:    global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
498; GFX10-NEXT:    s_waitcnt vmcnt(0)
499; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
500; GFX10-NEXT:    s_endpgm
501  %id = call i32 @llvm.amdgcn.workitem.id.x()
502  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
503  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
504  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
505  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
506  store i32 %result, i32 addrspace(1)* %out.gep
507  ret void
508}
509
; i32 atomic inc with an unused result on an addrspace(1) pointer indexed by
; llvm.amdgcn.workitem.id.x and then advanced by 5 x i32. CI and VI expect two
; v_add/v_addc pairs (the shifted index, then the constant 20) before
; flat_atomic_inc; GFX9 and GFX10 expect the shifted v0 as the VGPR operand
; and `offset:20` on global_atomic_inc. A destination VGPR and `glc` are still
; present on every prefix.
510define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
511; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
512; CI:       ; %bb.0:
513; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
514; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
515; CI-NEXT:    s_waitcnt lgkmcnt(0)
516; CI-NEXT:    v_mov_b32_e32 v0, s0
517; CI-NEXT:    v_mov_b32_e32 v1, s1
518; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
519; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
520; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
521; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
522; CI-NEXT:    v_mov_b32_e32 v2, 42
523; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
524; CI-NEXT:    s_endpgm
525;
526; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
527; VI:       ; %bb.0:
528; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
529; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
530; VI-NEXT:    s_waitcnt lgkmcnt(0)
531; VI-NEXT:    v_mov_b32_e32 v0, s0
532; VI-NEXT:    v_mov_b32_e32 v1, s1
533; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
534; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
535; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
536; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
537; VI-NEXT:    v_mov_b32_e32 v2, 42
538; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
539; VI-NEXT:    s_endpgm
540;
541; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
542; GFX9:       ; %bb.0:
543; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
544; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
545; GFX9-NEXT:    v_mov_b32_e32 v1, 42
546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
547; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc
548; GFX9-NEXT:    s_endpgm
549;
550; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64:
551; GFX10:       ; %bb.0:
552; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
553; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
554; GFX10-NEXT:    v_mov_b32_e32 v1, 42
555; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX10-NEXT:    global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc
557; GFX10-NEXT:    s_endpgm
558  %id = call i32 @llvm.amdgcn.workitem.id.x()
559  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
560  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
561  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
562  ret void
563}
564
565@lds0 = internal addrspace(3) global [512 x i32] undef, align 4
566
; Returning i32 atomic inc (operand 9) on element (workitem.id.x + 2) of the
; addrspace(3) array @lds0. The index %idx.0 is also stored to %add_use, so it
; has a second use besides the address computation. Every prefix expects the
; address to be formed as a left shift of v0 by 2 with the +2 elements folded
; into `offset:8` on ds_inc_rtn_u32, alongside a separate add of 2 whose result
; feeds the store to %add_use.
567define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
568; CI-LABEL: atomic_inc_shl_base_lds_0_i32:
569; CI:       ; %bb.0:
570; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
571; CI-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
572; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
573; CI-NEXT:    v_mov_b32_e32 v1, 9
574; CI-NEXT:    s_mov_b32 m0, -1
575; CI-NEXT:    ds_inc_rtn_u32 v3, v0, v1 offset:8
576; CI-NEXT:    s_waitcnt lgkmcnt(0)
577; CI-NEXT:    v_mov_b32_e32 v0, s2
578; CI-NEXT:    v_mov_b32_e32 v1, s3
579; CI-NEXT:    flat_store_dword v[0:1], v2
580; CI-NEXT:    v_mov_b32_e32 v0, s0
581; CI-NEXT:    v_mov_b32_e32 v1, s1
582; CI-NEXT:    flat_store_dword v[0:1], v3
583; CI-NEXT:    s_endpgm
584;
585; VI-LABEL: atomic_inc_shl_base_lds_0_i32:
586; VI:       ; %bb.0:
587; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
588; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
589; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
590; VI-NEXT:    v_mov_b32_e32 v1, 9
591; VI-NEXT:    s_mov_b32 m0, -1
592; VI-NEXT:    ds_inc_rtn_u32 v3, v0, v1 offset:8
593; VI-NEXT:    s_waitcnt lgkmcnt(0)
594; VI-NEXT:    v_mov_b32_e32 v0, s2
595; VI-NEXT:    v_mov_b32_e32 v1, s3
596; VI-NEXT:    flat_store_dword v[0:1], v2
597; VI-NEXT:    v_mov_b32_e32 v0, s0
598; VI-NEXT:    v_mov_b32_e32 v1, s1
599; VI-NEXT:    flat_store_dword v[0:1], v3
600; VI-NEXT:    s_endpgm
601;
602; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32:
603; GFX9:       ; %bb.0:
604; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
605; GFX9-NEXT:    v_add_u32_e32 v1, 2, v0
606; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
607; GFX9-NEXT:    v_mov_b32_e32 v2, 9
608; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v2 offset:8
609; GFX9-NEXT:    v_mov_b32_e32 v2, 0
610; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
612; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
613; GFX9-NEXT:    s_endpgm
614;
615; GFX10-LABEL: atomic_inc_shl_base_lds_0_i32:
616; GFX10:       ; %bb.0:
617; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
618; GFX10-NEXT:    v_mov_b32_e32 v2, 9
619; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
620; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
621; GFX10-NEXT:    ds_inc_rtn_u32 v1, v1, v2 offset:8
622; GFX10-NEXT:    v_mov_b32_e32 v2, 0
623; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
624; GFX10-NEXT:    global_store_dword v2, v0, s[2:3]
625; GFX10-NEXT:    global_store_dword v2, v1, s[0:1]
626; GFX10-NEXT:    s_endpgm
627  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
628  %idx.0 = add nsw i32 %tid.x, 2
629  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
630  %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
631  store i32 %idx.0, i32 addrspace(1)* %add_use
632  store i32 %val0, i32 addrspace(1)* %out
633  ret void
634}
635
; Returning i64 atomic inc through an addrspace(3) pointer with operand 42; the
; result is stored to %out. Every prefix expects the 64-bit operand to be
; materialised as the VGPR pair v[0:1] = {42, 0} and a ds_inc_rtn_u64 whose
; result pair is then written out by a dwordx2 store.
636define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
637; CI-LABEL: lds_atomic_inc_ret_i64:
638; CI:       ; %bb.0:
639; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
640; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
641; CI-NEXT:    v_mov_b32_e32 v0, 42
642; CI-NEXT:    v_mov_b32_e32 v1, 0
643; CI-NEXT:    s_mov_b32 m0, -1
644; CI-NEXT:    s_waitcnt lgkmcnt(0)
645; CI-NEXT:    v_mov_b32_e32 v2, s2
646; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
647; CI-NEXT:    v_mov_b32_e32 v3, s1
648; CI-NEXT:    v_mov_b32_e32 v2, s0
649; CI-NEXT:    s_waitcnt lgkmcnt(0)
650; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
651; CI-NEXT:    s_endpgm
652;
653; VI-LABEL: lds_atomic_inc_ret_i64:
654; VI:       ; %bb.0:
655; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
656; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
657; VI-NEXT:    v_mov_b32_e32 v0, 42
658; VI-NEXT:    v_mov_b32_e32 v1, 0
659; VI-NEXT:    s_mov_b32 m0, -1
660; VI-NEXT:    s_waitcnt lgkmcnt(0)
661; VI-NEXT:    v_mov_b32_e32 v2, s2
662; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
663; VI-NEXT:    v_mov_b32_e32 v3, s1
664; VI-NEXT:    v_mov_b32_e32 v2, s0
665; VI-NEXT:    s_waitcnt lgkmcnt(0)
666; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
667; VI-NEXT:    s_endpgm
668;
669; GFX9-LABEL: lds_atomic_inc_ret_i64:
670; GFX9:       ; %bb.0:
671; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
672; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
673; GFX9-NEXT:    v_mov_b32_e32 v0, 42
674; GFX9-NEXT:    v_mov_b32_e32 v1, 0
675; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX9-NEXT:    v_mov_b32_e32 v2, s2
677; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
678; GFX9-NEXT:    v_mov_b32_e32 v2, 0
679; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
681; GFX9-NEXT:    s_endpgm
682;
683; GFX10-LABEL: lds_atomic_inc_ret_i64:
684; GFX10:       ; %bb.0:
685; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
686; GFX10-NEXT:    v_mov_b32_e32 v0, 42
687; GFX10-NEXT:    v_mov_b32_e32 v1, 0
688; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX10-NEXT:    v_mov_b32_e32 v2, s0
690; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
691; GFX10-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
692; GFX10-NEXT:    v_mov_b32_e32 v2, 0
693; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
695; GFX10-NEXT:    s_endpgm
696  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
697  store i64 %result, i64 addrspace(1)* %out
698  ret void
699}
700
; Returning i64 atomic inc on an addrspace(3) pointer advanced by 4 x i64; the
; result is stored to %out. Every prefix expects the 32-byte displacement to
; appear as `offset:32` on ds_inc_rtn_u64 rather than as a separate add.
701define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
702; CI-LABEL: lds_atomic_inc_ret_i64_offset:
703; CI:       ; %bb.0:
704; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
705; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
706; CI-NEXT:    v_mov_b32_e32 v0, 42
707; CI-NEXT:    v_mov_b32_e32 v1, 0
708; CI-NEXT:    s_mov_b32 m0, -1
709; CI-NEXT:    s_waitcnt lgkmcnt(0)
710; CI-NEXT:    v_mov_b32_e32 v2, s2
711; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
712; CI-NEXT:    v_mov_b32_e32 v3, s1
713; CI-NEXT:    v_mov_b32_e32 v2, s0
714; CI-NEXT:    s_waitcnt lgkmcnt(0)
715; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
716; CI-NEXT:    s_endpgm
717;
718; VI-LABEL: lds_atomic_inc_ret_i64_offset:
719; VI:       ; %bb.0:
720; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
721; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
722; VI-NEXT:    v_mov_b32_e32 v0, 42
723; VI-NEXT:    v_mov_b32_e32 v1, 0
724; VI-NEXT:    s_mov_b32 m0, -1
725; VI-NEXT:    s_waitcnt lgkmcnt(0)
726; VI-NEXT:    v_mov_b32_e32 v2, s2
727; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
728; VI-NEXT:    v_mov_b32_e32 v3, s1
729; VI-NEXT:    v_mov_b32_e32 v2, s0
730; VI-NEXT:    s_waitcnt lgkmcnt(0)
731; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
732; VI-NEXT:    s_endpgm
733;
734; GFX9-LABEL: lds_atomic_inc_ret_i64_offset:
735; GFX9:       ; %bb.0:
736; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
737; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
738; GFX9-NEXT:    v_mov_b32_e32 v0, 42
739; GFX9-NEXT:    v_mov_b32_e32 v1, 0
740; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX9-NEXT:    v_mov_b32_e32 v2, s2
742; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
743; GFX9-NEXT:    v_mov_b32_e32 v2, 0
744; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
746; GFX9-NEXT:    s_endpgm
747;
748; GFX10-LABEL: lds_atomic_inc_ret_i64_offset:
749; GFX10:       ; %bb.0:
750; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
751; GFX10-NEXT:    v_mov_b32_e32 v0, 42
752; GFX10-NEXT:    v_mov_b32_e32 v1, 0
753; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX10-NEXT:    v_mov_b32_e32 v2, s0
755; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
756; GFX10-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
757; GFX10-NEXT:    v_mov_b32_e32 v2, 0
758; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
759; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
760; GFX10-NEXT:    s_endpgm
761  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
762  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
763  store i64 %result, i64 addrspace(1)* %out
764  ret void
765}
766
; i64 atomic inc on an addrspace(3) pointer whose result (%result) is never
; used. The checks nevertheless expect the returning ds_inc_rtn_u64 form on
; every prefix; see the FIXME at the top of the file regarding the no-return
; optimization.
767define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
768; CI-LABEL: lds_atomic_inc_noret_i64:
769; CI:       ; %bb.0:
770; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
771; CI-NEXT:    v_mov_b32_e32 v0, 42
772; CI-NEXT:    v_mov_b32_e32 v1, 0
773; CI-NEXT:    s_mov_b32 m0, -1
774; CI-NEXT:    s_waitcnt lgkmcnt(0)
775; CI-NEXT:    v_mov_b32_e32 v2, s0
776; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
777; CI-NEXT:    s_endpgm
778;
779; VI-LABEL: lds_atomic_inc_noret_i64:
780; VI:       ; %bb.0:
781; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
782; VI-NEXT:    v_mov_b32_e32 v0, 42
783; VI-NEXT:    v_mov_b32_e32 v1, 0
784; VI-NEXT:    s_mov_b32 m0, -1
785; VI-NEXT:    s_waitcnt lgkmcnt(0)
786; VI-NEXT:    v_mov_b32_e32 v2, s0
787; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
788; VI-NEXT:    s_endpgm
789;
790; GFX9-LABEL: lds_atomic_inc_noret_i64:
791; GFX9:       ; %bb.0:
792; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
793; GFX9-NEXT:    v_mov_b32_e32 v0, 42
794; GFX9-NEXT:    v_mov_b32_e32 v1, 0
795; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX9-NEXT:    v_mov_b32_e32 v2, s0
797; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
798; GFX9-NEXT:    s_endpgm
799;
800; GFX10-LABEL: lds_atomic_inc_noret_i64:
801; GFX10:       ; %bb.0:
802; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
803; GFX10-NEXT:    v_mov_b32_e32 v0, 42
804; GFX10-NEXT:    v_mov_b32_e32 v1, 0
805; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX10-NEXT:    v_mov_b32_e32 v2, s0
807; GFX10-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
808; GFX10-NEXT:    s_endpgm
809  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
810  ret void
811}
812
; LDS (addrspace(3)) i64 atomic inc of 42 with an unused result, applied to
; %ptr advanced by four i64 elements. The checks for all four targets expect
; that constant to appear as offset:32 on the DS instruction rather than as a
; separate add.
813define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
814; CI-LABEL: lds_atomic_inc_noret_i64_offset:
815; CI:       ; %bb.0:
816; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
817; CI-NEXT:    v_mov_b32_e32 v0, 42
818; CI-NEXT:    v_mov_b32_e32 v1, 0
819; CI-NEXT:    s_mov_b32 m0, -1
820; CI-NEXT:    s_waitcnt lgkmcnt(0)
821; CI-NEXT:    v_mov_b32_e32 v2, s0
822; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
823; CI-NEXT:    s_endpgm
824;
825; VI-LABEL: lds_atomic_inc_noret_i64_offset:
826; VI:       ; %bb.0:
827; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
828; VI-NEXT:    v_mov_b32_e32 v0, 42
829; VI-NEXT:    v_mov_b32_e32 v1, 0
830; VI-NEXT:    s_mov_b32 m0, -1
831; VI-NEXT:    s_waitcnt lgkmcnt(0)
832; VI-NEXT:    v_mov_b32_e32 v2, s0
833; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
834; VI-NEXT:    s_endpgm
835;
836; GFX9-LABEL: lds_atomic_inc_noret_i64_offset:
837; GFX9:       ; %bb.0:
838; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
839; GFX9-NEXT:    v_mov_b32_e32 v0, 42
840; GFX9-NEXT:    v_mov_b32_e32 v1, 0
841; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX9-NEXT:    v_mov_b32_e32 v2, s0
843; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
844; GFX9-NEXT:    s_endpgm
845;
846; GFX10-LABEL: lds_atomic_inc_noret_i64_offset:
847; GFX10:       ; %bb.0:
848; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
849; GFX10-NEXT:    v_mov_b32_e32 v0, 42
850; GFX10-NEXT:    v_mov_b32_e32 v1, 0
851; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX10-NEXT:    v_mov_b32_e32 v2, s0
853; GFX10-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
854; GFX10-NEXT:    s_endpgm
855  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
856  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
857  ret void
858}
859
; Global (addrspace(1)) i64 atomic inc of 42 on %ptr; the returned value is
; stored to %out. The CI and VI checks expect flat_atomic_inc_x2 addressed
; through a VGPR pair and a flat store, while the GFX9 and GFX10 checks expect
; global_atomic_inc_x2 using the SGPR base s[2:3] with a zero VGPR offset and
; a global store.
860define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
861; CI-LABEL: global_atomic_inc_ret_i64:
862; CI:       ; %bb.0:
863; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
864; CI-NEXT:    v_mov_b32_e32 v0, 42
865; CI-NEXT:    v_mov_b32_e32 v1, 0
866; CI-NEXT:    s_waitcnt lgkmcnt(0)
867; CI-NEXT:    v_mov_b32_e32 v2, s2
868; CI-NEXT:    v_mov_b32_e32 v3, s3
869; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
870; CI-NEXT:    v_mov_b32_e32 v3, s1
871; CI-NEXT:    v_mov_b32_e32 v2, s0
872; CI-NEXT:    s_waitcnt vmcnt(0)
873; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
874; CI-NEXT:    s_endpgm
875;
876; VI-LABEL: global_atomic_inc_ret_i64:
877; VI:       ; %bb.0:
878; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
879; VI-NEXT:    v_mov_b32_e32 v0, 42
880; VI-NEXT:    v_mov_b32_e32 v1, 0
881; VI-NEXT:    s_waitcnt lgkmcnt(0)
882; VI-NEXT:    v_mov_b32_e32 v2, s2
883; VI-NEXT:    v_mov_b32_e32 v3, s3
884; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
885; VI-NEXT:    v_mov_b32_e32 v3, s1
886; VI-NEXT:    v_mov_b32_e32 v2, s0
887; VI-NEXT:    s_waitcnt vmcnt(0)
888; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
889; VI-NEXT:    s_endpgm
890;
891; GFX9-LABEL: global_atomic_inc_ret_i64:
892; GFX9:       ; %bb.0:
893; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
894; GFX9-NEXT:    v_mov_b32_e32 v0, 42
895; GFX9-NEXT:    v_mov_b32_e32 v1, 0
896; GFX9-NEXT:    v_mov_b32_e32 v2, 0
897; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
899; GFX9-NEXT:    s_waitcnt vmcnt(0)
900; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
901; GFX9-NEXT:    s_endpgm
902;
903; GFX10-LABEL: global_atomic_inc_ret_i64:
904; GFX10:       ; %bb.0:
905; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
906; GFX10-NEXT:    v_mov_b32_e32 v0, 42
907; GFX10-NEXT:    v_mov_b32_e32 v1, 0
908; GFX10-NEXT:    v_mov_b32_e32 v2, 0
909; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
911; GFX10-NEXT:    s_waitcnt vmcnt(0)
912; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
913; GFX10-NEXT:    s_endpgm
914  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
915  store i64 %result, i64 addrspace(1)* %out
916  ret void
917}
918
; Global (addrspace(1)) i64 atomic inc of 42 at %ptr advanced by four i64
; elements, with the result stored to %out. The CI and VI checks expect the
; 32-byte displacement to be added to the SGPR address with
; s_add_u32/s_addc_u32; the GFX9 and GFX10 checks expect it as offset:32 on
; the global atomic instead.
919define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
920; CI-LABEL: global_atomic_inc_ret_i64_offset:
921; CI:       ; %bb.0:
922; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
923; CI-NEXT:    v_mov_b32_e32 v0, 42
924; CI-NEXT:    v_mov_b32_e32 v1, 0
925; CI-NEXT:    s_waitcnt lgkmcnt(0)
926; CI-NEXT:    s_add_u32 s2, s2, 32
927; CI-NEXT:    s_addc_u32 s3, s3, 0
928; CI-NEXT:    v_mov_b32_e32 v2, s2
929; CI-NEXT:    v_mov_b32_e32 v3, s3
930; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
931; CI-NEXT:    v_mov_b32_e32 v3, s1
932; CI-NEXT:    v_mov_b32_e32 v2, s0
933; CI-NEXT:    s_waitcnt vmcnt(0)
934; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
935; CI-NEXT:    s_endpgm
936;
937; VI-LABEL: global_atomic_inc_ret_i64_offset:
938; VI:       ; %bb.0:
939; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
940; VI-NEXT:    v_mov_b32_e32 v0, 42
941; VI-NEXT:    v_mov_b32_e32 v1, 0
942; VI-NEXT:    s_waitcnt lgkmcnt(0)
943; VI-NEXT:    s_add_u32 s2, s2, 32
944; VI-NEXT:    s_addc_u32 s3, s3, 0
945; VI-NEXT:    v_mov_b32_e32 v2, s2
946; VI-NEXT:    v_mov_b32_e32 v3, s3
947; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
948; VI-NEXT:    v_mov_b32_e32 v3, s1
949; VI-NEXT:    v_mov_b32_e32 v2, s0
950; VI-NEXT:    s_waitcnt vmcnt(0)
951; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
952; VI-NEXT:    s_endpgm
953;
954; GFX9-LABEL: global_atomic_inc_ret_i64_offset:
955; GFX9:       ; %bb.0:
956; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
957; GFX9-NEXT:    v_mov_b32_e32 v0, 42
958; GFX9-NEXT:    v_mov_b32_e32 v1, 0
959; GFX9-NEXT:    v_mov_b32_e32 v2, 0
960; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
961; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
962; GFX9-NEXT:    s_waitcnt vmcnt(0)
963; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
964; GFX9-NEXT:    s_endpgm
965;
966; GFX10-LABEL: global_atomic_inc_ret_i64_offset:
967; GFX10:       ; %bb.0:
968; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
969; GFX10-NEXT:    v_mov_b32_e32 v0, 42
970; GFX10-NEXT:    v_mov_b32_e32 v1, 0
971; GFX10-NEXT:    v_mov_b32_e32 v2, 0
972; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
974; GFX10-NEXT:    s_waitcnt vmcnt(0)
975; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
976; GFX10-NEXT:    s_endpgm
977  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
978  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
979  store i64 %result, i64 addrspace(1)* %out
980  ret void
981}
982
; Global (addrspace(1)) i64 atomic inc of 42 on %ptr whose result is never
; used. The expected atomic on every target still names a destination
; register pair and carries the glc modifier -- presumably the missing
; no-return optimization mentioned in the FIXME at the top of this file.
983define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
984; CI-LABEL: global_atomic_inc_noret_i64:
985; CI:       ; %bb.0:
986; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
987; CI-NEXT:    v_mov_b32_e32 v0, 42
988; CI-NEXT:    v_mov_b32_e32 v1, 0
989; CI-NEXT:    s_waitcnt lgkmcnt(0)
990; CI-NEXT:    v_mov_b32_e32 v3, s1
991; CI-NEXT:    v_mov_b32_e32 v2, s0
992; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
993; CI-NEXT:    s_endpgm
994;
995; VI-LABEL: global_atomic_inc_noret_i64:
996; VI:       ; %bb.0:
997; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
998; VI-NEXT:    v_mov_b32_e32 v0, 42
999; VI-NEXT:    v_mov_b32_e32 v1, 0
1000; VI-NEXT:    s_waitcnt lgkmcnt(0)
1001; VI-NEXT:    v_mov_b32_e32 v3, s1
1002; VI-NEXT:    v_mov_b32_e32 v2, s0
1003; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1004; VI-NEXT:    s_endpgm
1005;
1006; GFX9-LABEL: global_atomic_inc_noret_i64:
1007; GFX9:       ; %bb.0:
1008; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1009; GFX9-NEXT:    v_mov_b32_e32 v0, 42
1010; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1011; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1012; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc
1014; GFX9-NEXT:    s_endpgm
1015;
1016; GFX10-LABEL: global_atomic_inc_noret_i64:
1017; GFX10:       ; %bb.0:
1018; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1019; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1020; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1021; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1022; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc
1024; GFX10-NEXT:    s_endpgm
1025  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
1026  ret void
1027}
1028
; Global (addrspace(1)) i64 atomic inc of 42 with an unused result, applied to
; %ptr advanced by four i64 elements. The CI and VI checks expect the 32-byte
; displacement as scalar adds on the address; the GFX9 and GFX10 checks expect
; offset:32 on the global atomic. The glc modifier is still expected on every
; target even though %result is unused.
1029define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
1030; CI-LABEL: global_atomic_inc_noret_i64_offset:
1031; CI:       ; %bb.0:
1032; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1033; CI-NEXT:    v_mov_b32_e32 v0, 42
1034; CI-NEXT:    v_mov_b32_e32 v1, 0
1035; CI-NEXT:    s_waitcnt lgkmcnt(0)
1036; CI-NEXT:    s_add_u32 s0, s0, 32
1037; CI-NEXT:    s_addc_u32 s1, s1, 0
1038; CI-NEXT:    v_mov_b32_e32 v3, s1
1039; CI-NEXT:    v_mov_b32_e32 v2, s0
1040; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1041; CI-NEXT:    s_endpgm
1042;
1043; VI-LABEL: global_atomic_inc_noret_i64_offset:
1044; VI:       ; %bb.0:
1045; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1046; VI-NEXT:    v_mov_b32_e32 v0, 42
1047; VI-NEXT:    v_mov_b32_e32 v1, 0
1048; VI-NEXT:    s_waitcnt lgkmcnt(0)
1049; VI-NEXT:    s_add_u32 s0, s0, 32
1050; VI-NEXT:    s_addc_u32 s1, s1, 0
1051; VI-NEXT:    v_mov_b32_e32 v3, s1
1052; VI-NEXT:    v_mov_b32_e32 v2, s0
1053; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1054; VI-NEXT:    s_endpgm
1055;
1056; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
1057; GFX9:       ; %bb.0:
1058; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1059; GFX9-NEXT:    v_mov_b32_e32 v0, 42
1060; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1061; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1062; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1063; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
1064; GFX9-NEXT:    s_endpgm
1065;
1066; GFX10-LABEL: global_atomic_inc_noret_i64_offset:
1067; GFX10:       ; %bb.0:
1068; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1069; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1070; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1071; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1072; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1073; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
1074; GFX10-NEXT:    s_endpgm
1075  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
1076  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
1077  ret void
1078}
1079
; Per-workitem global i64 atomic inc of 42. Both %ptr and %out are indexed by
; workitem.id.x, the atomic address is a further five i64 elements (40 bytes)
; past the indexed %ptr, and the result is stored to the indexed %out.
; The CI and VI checks expect the whole address, including the +40, to be
; built with vector add/add-with-carry pairs. The GFX9 and GFX10 checks expect
; the id shifted left by 3 to serve as the VGPR offset for both the atomic
; (with offset:40) and the store.
1080define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
1081; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64:
1082; CI:       ; %bb.0:
1083; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1084; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1085; CI-NEXT:    s_waitcnt lgkmcnt(0)
1086; CI-NEXT:    v_mov_b32_e32 v0, s2
1087; CI-NEXT:    v_mov_b32_e32 v1, s3
1088; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
1089; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1090; CI-NEXT:    v_mov_b32_e32 v0, s0
1091; CI-NEXT:    v_mov_b32_e32 v1, s1
1092; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1093; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1094; CI-NEXT:    v_mov_b32_e32 v2, 42
1095; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
1096; CI-NEXT:    v_mov_b32_e32 v3, 0
1097; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1098; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
1099; CI-NEXT:    s_waitcnt vmcnt(0)
1100; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1101; CI-NEXT:    s_endpgm
1102;
1103; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64:
1104; VI:       ; %bb.0:
1105; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1106; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1107; VI-NEXT:    s_waitcnt lgkmcnt(0)
1108; VI-NEXT:    v_mov_b32_e32 v0, s2
1109; VI-NEXT:    v_mov_b32_e32 v1, s3
1110; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
1111; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1112; VI-NEXT:    v_mov_b32_e32 v0, s0
1113; VI-NEXT:    v_mov_b32_e32 v1, s1
1114; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1115; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1116; VI-NEXT:    v_mov_b32_e32 v2, 42
1117; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
1118; VI-NEXT:    v_mov_b32_e32 v3, 0
1119; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1120; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
1121; VI-NEXT:    s_waitcnt vmcnt(0)
1122; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1123; VI-NEXT:    s_endpgm
1124;
1125; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64:
1126; GFX9:       ; %bb.0:
1127; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1128; GFX9-NEXT:    v_mov_b32_e32 v1, 42
1129; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1130; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1131; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
1133; GFX9-NEXT:    s_waitcnt vmcnt(0)
1134; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
1135; GFX9-NEXT:    s_endpgm
1136;
1137; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64:
1138; GFX10:       ; %bb.0:
1139; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1140; GFX10-NEXT:    v_mov_b32_e32 v1, 42
1141; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1142; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1143; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
1145; GFX10-NEXT:    s_waitcnt vmcnt(0)
1146; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
1147; GFX10-NEXT:    s_endpgm
1148  %id = call i32 @llvm.amdgcn.workitem.id.x()
1149  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
1150  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
1151  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
1152  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
1153  store i64 %result, i64 addrspace(1)* %out.gep
1154  ret void
1155}
1156
; Per-workitem global i64 atomic inc of 42 with an unused result. %ptr is
; indexed by workitem.id.x and then advanced by five more i64 elements
; (40 bytes). The CI and VI checks expect vector adds for the full address,
; including the +40; the GFX9 and GFX10 checks expect the id shifted left by 3
; as the VGPR offset together with offset:40. The glc modifier is still
; expected on every target even though %result is unused.
1157define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
1158; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
1159; CI:       ; %bb.0:
1160; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1161; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1162; CI-NEXT:    s_waitcnt lgkmcnt(0)
1163; CI-NEXT:    v_mov_b32_e32 v0, s0
1164; CI-NEXT:    v_mov_b32_e32 v1, s1
1165; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
1166; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1167; CI-NEXT:    v_mov_b32_e32 v0, 42
1168; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
1169; CI-NEXT:    v_mov_b32_e32 v1, 0
1170; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1171; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1172; CI-NEXT:    s_endpgm
1173;
1174; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
1175; VI:       ; %bb.0:
1176; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1177; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1178; VI-NEXT:    s_waitcnt lgkmcnt(0)
1179; VI-NEXT:    v_mov_b32_e32 v0, s0
1180; VI-NEXT:    v_mov_b32_e32 v1, s1
1181; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
1182; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1183; VI-NEXT:    v_mov_b32_e32 v0, 42
1184; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
1185; VI-NEXT:    v_mov_b32_e32 v1, 0
1186; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1187; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1188; VI-NEXT:    s_endpgm
1189;
1190; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
1191; GFX9:       ; %bb.0:
1192; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1193; GFX9-NEXT:    v_mov_b32_e32 v1, 42
1194; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1195; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1196; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1197; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc
1198; GFX9-NEXT:    s_endpgm
1199;
1200; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64:
1201; GFX10:       ; %bb.0:
1202; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1203; GFX10-NEXT:    v_mov_b32_e32 v1, 42
1204; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1205; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1206; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1207; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc
1208; GFX10-NEXT:    s_endpgm
1209  %id = call i32 @llvm.amdgcn.workitem.id.x()
1210  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
1211  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
1212  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
1213  ret void
1214}
1215
; Flat (address space 0) i32 atomic inc of 42 on %ptr; the returned value is
; stored through the flat pointer %out. Only the shared GCN check prefix is
; used here, so one set of assertions covers all four RUN configurations.
1216define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 {
1217; GCN-LABEL: flat_atomic_inc_ret_i32:
1218; GCN:       ; %bb.0:
1219; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1220; GCN-NEXT:    v_mov_b32_e32 v2, 42
1221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1222; GCN-NEXT:    v_mov_b32_e32 v0, s2
1223; GCN-NEXT:    v_mov_b32_e32 v1, s3
1224; GCN-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
1225; GCN-NEXT:    v_mov_b32_e32 v0, s0
1226; GCN-NEXT:    v_mov_b32_e32 v1, s1
1227; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1228; GCN-NEXT:    flat_store_dword v[0:1], v2
1229; GCN-NEXT:    s_endpgm
1230  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
1231  store i32 %result, i32* %out
1232  ret void
1233}
1234
; Flat i32 atomic inc of 42 at %ptr advanced by four i32 elements (16 bytes),
; with the result stored to %out. Only the GFX9 checks expect the displacement
; as offset:16 on the flat atomic; the CI, VI and GFX10 checks expect it to be
; added to the SGPR address with s_add_u32/s_addc_u32 first.
1235define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 {
1236; CI-LABEL: flat_atomic_inc_ret_i32_offset:
1237; CI:       ; %bb.0:
1238; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1239; CI-NEXT:    v_mov_b32_e32 v2, 42
1240; CI-NEXT:    s_waitcnt lgkmcnt(0)
1241; CI-NEXT:    s_add_u32 s2, s2, 16
1242; CI-NEXT:    s_addc_u32 s3, s3, 0
1243; CI-NEXT:    v_mov_b32_e32 v0, s2
1244; CI-NEXT:    v_mov_b32_e32 v1, s3
1245; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
1246; CI-NEXT:    v_mov_b32_e32 v0, s0
1247; CI-NEXT:    v_mov_b32_e32 v1, s1
1248; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1249; CI-NEXT:    flat_store_dword v[0:1], v2
1250; CI-NEXT:    s_endpgm
1251;
1252; VI-LABEL: flat_atomic_inc_ret_i32_offset:
1253; VI:       ; %bb.0:
1254; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1255; VI-NEXT:    v_mov_b32_e32 v2, 42
1256; VI-NEXT:    s_waitcnt lgkmcnt(0)
1257; VI-NEXT:    s_add_u32 s2, s2, 16
1258; VI-NEXT:    s_addc_u32 s3, s3, 0
1259; VI-NEXT:    v_mov_b32_e32 v0, s2
1260; VI-NEXT:    v_mov_b32_e32 v1, s3
1261; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
1262; VI-NEXT:    v_mov_b32_e32 v0, s0
1263; VI-NEXT:    v_mov_b32_e32 v1, s1
1264; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1265; VI-NEXT:    flat_store_dword v[0:1], v2
1266; VI-NEXT:    s_endpgm
1267;
1268; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
1269; GFX9:       ; %bb.0:
1270; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1271; GFX9-NEXT:    v_mov_b32_e32 v2, 42
1272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1274; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1275; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
1276; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1277; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1278; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1279; GFX9-NEXT:    flat_store_dword v[0:1], v2
1280; GFX9-NEXT:    s_endpgm
1281;
1282; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
1283; GFX10:       ; %bb.0:
1284; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1285; GFX10-NEXT:    v_mov_b32_e32 v2, 42
1286; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX10-NEXT:    s_add_u32 s2, s2, 16
1288; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1289; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1290; GFX10-NEXT:    v_mov_b32_e32 v1, s3
1291; GFX10-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
1292; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1293; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1294; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1295; GFX10-NEXT:    flat_store_dword v[0:1], v2
1296; GFX10-NEXT:    s_endpgm
1297  %gep = getelementptr i32, i32* %ptr, i32 4
1298  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
1299  store i32 %result, i32* %out
1300  ret void
1301}
1302
; Flat i32 atomic inc of 42 on %ptr whose result is never used. Only the
; shared GCN check prefix is used, so one set of assertions covers all four
; RUN configurations. The expected flat_atomic_inc still names a destination
; register and carries glc -- presumably the missing no-return optimization
; mentioned in the FIXME at the top of this file.
1303define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind {
1304; GCN-LABEL: flat_atomic_inc_noret_i32:
1305; GCN:       ; %bb.0:
1306; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1307; GCN-NEXT:    v_mov_b32_e32 v2, 42
1308; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1309; GCN-NEXT:    v_mov_b32_e32 v0, s0
1310; GCN-NEXT:    v_mov_b32_e32 v1, s1
1311; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1312; GCN-NEXT:    s_endpgm
1313  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
1314  ret void
1315}
1316
; Flat i32 atomic inc of 42 with an unused result, applied to %ptr advanced by
; four i32 elements (16 bytes). Only the GFX9 checks expect offset:16 on the
; flat atomic; the CI, VI and GFX10 checks expect scalar adds on the address
; instead. The glc modifier is still expected on every target even though
; %result is unused.
1317define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind {
1318; CI-LABEL: flat_atomic_inc_noret_i32_offset:
1319; CI:       ; %bb.0:
1320; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1321; CI-NEXT:    v_mov_b32_e32 v2, 42
1322; CI-NEXT:    s_waitcnt lgkmcnt(0)
1323; CI-NEXT:    s_add_u32 s0, s0, 16
1324; CI-NEXT:    s_addc_u32 s1, s1, 0
1325; CI-NEXT:    v_mov_b32_e32 v0, s0
1326; CI-NEXT:    v_mov_b32_e32 v1, s1
1327; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1328; CI-NEXT:    s_endpgm
1329;
1330; VI-LABEL: flat_atomic_inc_noret_i32_offset:
1331; VI:       ; %bb.0:
1332; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1333; VI-NEXT:    v_mov_b32_e32 v2, 42
1334; VI-NEXT:    s_waitcnt lgkmcnt(0)
1335; VI-NEXT:    s_add_u32 s0, s0, 16
1336; VI-NEXT:    s_addc_u32 s1, s1, 0
1337; VI-NEXT:    v_mov_b32_e32 v0, s0
1338; VI-NEXT:    v_mov_b32_e32 v1, s1
1339; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1340; VI-NEXT:    s_endpgm
1341;
1342; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
1343; GFX9:       ; %bb.0:
1344; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1345; GFX9-NEXT:    v_mov_b32_e32 v2, 42
1346; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1347; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1348; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1349; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
1350; GFX9-NEXT:    s_endpgm
1351;
1352; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
1353; GFX10:       ; %bb.0:
1354; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1355; GFX10-NEXT:    v_mov_b32_e32 v2, 42
1356; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX10-NEXT:    s_add_u32 s0, s0, 16
1358; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1359; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1360; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1361; GFX10-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1362; GFX10-NEXT:    s_endpgm
1363  %gep = getelementptr i32, i32* %ptr, i32 4
1364  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
1365  ret void
1366}
1367
; Per-workitem flat i32 atomic inc of 42. Both %ptr and %out are indexed by
; workitem.id.x, the atomic address is a further five i32 elements (20 bytes)
; past the indexed %ptr, and the result is stored to the indexed %out.
; Every target's checks expect the id shifted left by 2 and added to the base
; with a vector add/add-with-carry pair. Only the GFX9 checks then expect the
; remaining 20 bytes as offset:20 on the atomic; the CI, VI and GFX10 checks
; expect a second vector add of 20.
1368define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 {
1369; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
1370; CI:       ; %bb.0:
1371; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1372; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1373; CI-NEXT:    s_waitcnt lgkmcnt(0)
1374; CI-NEXT:    v_mov_b32_e32 v0, s2
1375; CI-NEXT:    v_mov_b32_e32 v1, s3
1376; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
1377; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1378; CI-NEXT:    v_mov_b32_e32 v0, s0
1379; CI-NEXT:    v_mov_b32_e32 v1, s1
1380; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1381; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1382; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
1383; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1384; CI-NEXT:    v_mov_b32_e32 v4, 42
1385; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
1386; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1387; CI-NEXT:    flat_store_dword v[0:1], v2
1388; CI-NEXT:    s_endpgm
1389;
1390; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
1391; VI:       ; %bb.0:
1392; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1393; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1394; VI-NEXT:    s_waitcnt lgkmcnt(0)
1395; VI-NEXT:    v_mov_b32_e32 v0, s2
1396; VI-NEXT:    v_mov_b32_e32 v1, s3
1397; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
1398; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1399; VI-NEXT:    v_mov_b32_e32 v0, s0
1400; VI-NEXT:    v_mov_b32_e32 v1, s1
1401; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1402; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1403; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
1404; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1405; VI-NEXT:    v_mov_b32_e32 v4, 42
1406; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
1407; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1408; VI-NEXT:    flat_store_dword v[0:1], v2
1409; VI-NEXT:    s_endpgm
1410;
1411; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
1412; GFX9:       ; %bb.0:
1413; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1414; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1417; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1418; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1419; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1420; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1421; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1422; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1423; GFX9-NEXT:    v_mov_b32_e32 v4, 42
1424; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v4 offset:20 glc
1425; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1426; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1427; GFX9-NEXT:    flat_store_dword v[2:3], v0
1428; GFX9-NEXT:    s_endpgm
1429;
1430; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
1431; GFX10:       ; %bb.0:
1432; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1433; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1434; GFX10-NEXT:    v_mov_b32_e32 v3, 42
1435; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1436; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1437; GFX10-NEXT:    v_mov_b32_e32 v1, s3
1438; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
1439; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1440; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 20
1441; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1442; GFX10-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
1443; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1444; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1445; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
1446; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1447; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1448; GFX10-NEXT:    flat_store_dword v[0:1], v3
1449; GFX10-NEXT:    s_endpgm
1450  %id = call i32 @llvm.amdgcn.workitem.id.x()
1451  %gep.tid = getelementptr i32, i32* %ptr, i32 %id
1452  %out.gep = getelementptr i32, i32* %out, i32 %id
1453  %gep = getelementptr i32, i32* %gep.tid, i32 5
1454  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
1455  store i32 %result, i32* %out.gep
1456  ret void
1457}
1458
; Per-workitem flat i32 atomic inc of 42 with an unused result. %ptr is
; indexed by workitem.id.x and then advanced by five more i32 elements
; (20 bytes). Only the GFX9 checks expect the 20 bytes as offset:20 on the
; atomic; the CI, VI and GFX10 checks expect an extra vector add of 20. The
; glc modifier is still expected on every target even though %result is
; unused.
1459define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 {
1460; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
1461; CI:       ; %bb.0:
1462; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1463; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1464; CI-NEXT:    s_waitcnt lgkmcnt(0)
1465; CI-NEXT:    v_mov_b32_e32 v0, s0
1466; CI-NEXT:    v_mov_b32_e32 v1, s1
1467; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1468; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1469; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
1470; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1471; CI-NEXT:    v_mov_b32_e32 v2, 42
1472; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1473; CI-NEXT:    s_endpgm
1474;
1475; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
1476; VI:       ; %bb.0:
1477; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1478; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1479; VI-NEXT:    s_waitcnt lgkmcnt(0)
1480; VI-NEXT:    v_mov_b32_e32 v0, s0
1481; VI-NEXT:    v_mov_b32_e32 v1, s1
1482; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1483; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1484; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
1485; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1486; VI-NEXT:    v_mov_b32_e32 v2, 42
1487; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1488; VI-NEXT:    s_endpgm
1489;
1490; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
1491; GFX9:       ; %bb.0:
1492; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1493; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1494; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1496; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1497; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1498; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1499; GFX9-NEXT:    v_mov_b32_e32 v2, 42
1500; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:20 glc
1501; GFX9-NEXT:    s_endpgm
1502;
1503; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
1504; GFX10:       ; %bb.0:
1505; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1506; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1507; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1509; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1510; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
1511; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1512; GFX10-NEXT:    v_mov_b32_e32 v2, 42
1513; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 20
1514; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1515; GFX10-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
1516; GFX10-NEXT:    s_endpgm
1517  %id = call i32 @llvm.amdgcn.workitem.id.x()
1518  %gep.tid = getelementptr i32, i32* %ptr, i32 %id
1519  %gep = getelementptr i32, i32* %gep.tid, i32 5
1520  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
1521  ret void
1522}
1523
; Module-internal LDS (addrspace(3)) array of 512 i64 elements, 8-byte
; aligned and left undef-initialized. Presumably the base object for the
; shl-base LDS test that follows -- TODO confirm against that function's body.
1524@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
1525
; i64 atomic inc on an element of @lds1 selected by (workitem id x + 2).
; The index is also stored to %add_use, so the add has a second use besides
; the address computation; the value returned by the atomic is stored to %out.
; The expected output for every target carries the constant part of the index
; on the DS instruction as offset:16 (2 elements * 8 bytes), while a separate
; add of 2 is kept for the value written to %add_use.
1526define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
1527; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
1528; CI:       ; %bb.0:
1529; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1530; CI-NEXT:    v_add_i32_e32 v4, vcc, 2, v0
1531; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1532; CI-NEXT:    v_mov_b32_e32 v0, 9
1533; CI-NEXT:    v_mov_b32_e32 v1, 0
1534; CI-NEXT:    s_mov_b32 m0, -1
1535; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16
1536; CI-NEXT:    s_waitcnt lgkmcnt(0)
1537; CI-NEXT:    v_mov_b32_e32 v2, s2
1538; CI-NEXT:    v_mov_b32_e32 v3, s3
1539; CI-NEXT:    flat_store_dword v[2:3], v4
1540; CI-NEXT:    v_mov_b32_e32 v3, s1
1541; CI-NEXT:    v_mov_b32_e32 v2, s0
1542; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1543; CI-NEXT:    s_endpgm
1544;
1545; VI-LABEL: atomic_inc_shl_base_lds_0_i64:
1546; VI:       ; %bb.0:
1547; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1548; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
1549; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1550; VI-NEXT:    v_mov_b32_e32 v0, 9
1551; VI-NEXT:    v_mov_b32_e32 v1, 0
1552; VI-NEXT:    s_mov_b32 m0, -1
1553; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16
1554; VI-NEXT:    s_waitcnt lgkmcnt(0)
1555; VI-NEXT:    v_mov_b32_e32 v2, s2
1556; VI-NEXT:    v_mov_b32_e32 v3, s3
1557; VI-NEXT:    flat_store_dword v[2:3], v4
1558; VI-NEXT:    v_mov_b32_e32 v3, s1
1559; VI-NEXT:    v_mov_b32_e32 v2, s0
1560; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1561; VI-NEXT:    s_endpgm
1562;
1563; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
1564; GFX9:       ; %bb.0:
1565; GFX9-NEXT:    v_mov_b32_e32 v1, 9
1566; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1567; GFX9-NEXT:    v_add_u32_e32 v3, 2, v0
1568; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1569; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1570; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16
1571; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX9-NEXT:    global_store_dword v2, v3, s[2:3]
1574; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1575; GFX9-NEXT:    s_endpgm
1576;
1577; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64:
1578; GFX10:       ; %bb.0:
1579; GFX10-NEXT:    v_mov_b32_e32 v1, 9
1580; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1581; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1582; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1583; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
1584; GFX10-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
1585; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1586; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1587; GFX10-NEXT:    global_store_dword v3, v0, s[2:3]
1588; GFX10-NEXT:    global_store_dwordx2 v3, v[1:2], s[0:1]
1589; GFX10-NEXT:    s_endpgm
1590  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1591  %idx.0 = add nsw i32 %tid.x, 2
1592  %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
1593  %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
1594  store i32 %idx.0, i32 addrspace(1)* %add_use
1595  store i64 %val0, i64 addrspace(1)* %out
1596  ret void
1597}
1598
; i64 atomic inc through the flat pointer %ptr with operand 42; the value
; returned by the intrinsic is stored to %out. All four RUN lines share one
; expected sequence (the common GCN prefix) built around flat_atomic_inc_x2.
1599define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
1600; GCN-LABEL: flat_atomic_inc_ret_i64:
1601; GCN:       ; %bb.0:
1602; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1603; GCN-NEXT:    v_mov_b32_e32 v0, 42
1604; GCN-NEXT:    v_mov_b32_e32 v1, 0
1605; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1606; GCN-NEXT:    v_mov_b32_e32 v2, s2
1607; GCN-NEXT:    v_mov_b32_e32 v3, s3
1608; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1609; GCN-NEXT:    v_mov_b32_e32 v3, s1
1610; GCN-NEXT:    v_mov_b32_e32 v2, s0
1611; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1612; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1613; GCN-NEXT:    s_endpgm
1614  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
1615  store i64 %result, i64* %out
1616  ret void
1617}
1618
; Same as @flat_atomic_inc_ret_i64, but the atomic targets %ptr advanced by
; 4 i64 elements (32 bytes). In the expected output only the gfx900 run puts
; the constant on the instruction (offset:32); the bonaire, tonga and gfx1010
; runs add 32 to the 64-bit base with s_add_u32/s_addc_u32 first.
1619define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 {
1620; CI-LABEL: flat_atomic_inc_ret_i64_offset:
1621; CI:       ; %bb.0:
1622; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1623; CI-NEXT:    v_mov_b32_e32 v0, 42
1624; CI-NEXT:    v_mov_b32_e32 v1, 0
1625; CI-NEXT:    s_waitcnt lgkmcnt(0)
1626; CI-NEXT:    s_add_u32 s2, s2, 32
1627; CI-NEXT:    s_addc_u32 s3, s3, 0
1628; CI-NEXT:    v_mov_b32_e32 v2, s2
1629; CI-NEXT:    v_mov_b32_e32 v3, s3
1630; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1631; CI-NEXT:    v_mov_b32_e32 v3, s1
1632; CI-NEXT:    v_mov_b32_e32 v2, s0
1633; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1634; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1635; CI-NEXT:    s_endpgm
1636;
1637; VI-LABEL: flat_atomic_inc_ret_i64_offset:
1638; VI:       ; %bb.0:
1639; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1640; VI-NEXT:    v_mov_b32_e32 v0, 42
1641; VI-NEXT:    v_mov_b32_e32 v1, 0
1642; VI-NEXT:    s_waitcnt lgkmcnt(0)
1643; VI-NEXT:    s_add_u32 s2, s2, 32
1644; VI-NEXT:    s_addc_u32 s3, s3, 0
1645; VI-NEXT:    v_mov_b32_e32 v2, s2
1646; VI-NEXT:    v_mov_b32_e32 v3, s3
1647; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1648; VI-NEXT:    v_mov_b32_e32 v3, s1
1649; VI-NEXT:    v_mov_b32_e32 v2, s0
1650; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1651; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1652; VI-NEXT:    s_endpgm
1653;
1654; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
1655; GFX9:       ; %bb.0:
1656; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1657; GFX9-NEXT:    v_mov_b32_e32 v0, 42
1658; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1659; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1660; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1661; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1662; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
1663; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1664; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1665; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1666; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1667; GFX9-NEXT:    s_endpgm
1668;
1669; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
1670; GFX10:       ; %bb.0:
1671; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1672; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1673; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1674; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1675; GFX10-NEXT:    s_add_u32 s2, s2, 32
1676; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1677; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1678; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1679; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1680; GFX10-NEXT:    v_mov_b32_e32 v3, s1
1681; GFX10-NEXT:    v_mov_b32_e32 v2, s0
1682; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1683; GFX10-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1684; GFX10-NEXT:    s_endpgm
1685  %gep = getelementptr i64, i64* %ptr, i32 4
1686  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
1687  store i64 %result, i64* %out
1688  ret void
1689}
1690
; i64 flat atomic inc with operand 42 whose result is never used. The expected
; output still gives the instruction a destination register pair and the glc
; modifier, i.e. no separate no-return form is selected (see the FIXME at the
; top of the file). Note this kernel spells out `nounwind` instead of using
; attribute group #0.
1691define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
1692; GCN-LABEL: flat_atomic_inc_noret_i64:
1693; GCN:       ; %bb.0:
1694; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1695; GCN-NEXT:    v_mov_b32_e32 v0, 42
1696; GCN-NEXT:    v_mov_b32_e32 v1, 0
1697; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1698; GCN-NEXT:    v_mov_b32_e32 v3, s1
1699; GCN-NEXT:    v_mov_b32_e32 v2, s0
1700; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1701; GCN-NEXT:    s_endpgm
1702  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
1703  ret void
1704}
1705
; Unused-result i64 flat atomic inc on %ptr advanced by 4 i64 elements
; (32 bytes). As in the returning variant, only the gfx900 expectations carry
; offset:32 on the instruction; the other three targets add 32 to the scalar
; base address beforehand. This kernel spells out `nounwind` instead of using
; attribute group #0.
1706define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind {
1707; CI-LABEL: flat_atomic_inc_noret_i64_offset:
1708; CI:       ; %bb.0:
1709; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1710; CI-NEXT:    v_mov_b32_e32 v0, 42
1711; CI-NEXT:    v_mov_b32_e32 v1, 0
1712; CI-NEXT:    s_waitcnt lgkmcnt(0)
1713; CI-NEXT:    s_add_u32 s0, s0, 32
1714; CI-NEXT:    s_addc_u32 s1, s1, 0
1715; CI-NEXT:    v_mov_b32_e32 v3, s1
1716; CI-NEXT:    v_mov_b32_e32 v2, s0
1717; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1718; CI-NEXT:    s_endpgm
1719;
1720; VI-LABEL: flat_atomic_inc_noret_i64_offset:
1721; VI:       ; %bb.0:
1722; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1723; VI-NEXT:    v_mov_b32_e32 v0, 42
1724; VI-NEXT:    v_mov_b32_e32 v1, 0
1725; VI-NEXT:    s_waitcnt lgkmcnt(0)
1726; VI-NEXT:    s_add_u32 s0, s0, 32
1727; VI-NEXT:    s_addc_u32 s1, s1, 0
1728; VI-NEXT:    v_mov_b32_e32 v3, s1
1729; VI-NEXT:    v_mov_b32_e32 v2, s0
1730; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1731; VI-NEXT:    s_endpgm
1732;
1733; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
1734; GFX9:       ; %bb.0:
1735; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1736; GFX9-NEXT:    v_mov_b32_e32 v0, 42
1737; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1738; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1739; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1740; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1741; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
1742; GFX9-NEXT:    s_endpgm
1743;
1744; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
1745; GFX10:       ; %bb.0:
1746; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1747; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1748; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1749; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1750; GFX10-NEXT:    s_add_u32 s0, s0, 32
1751; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1752; GFX10-NEXT:    v_mov_b32_e32 v3, s1
1753; GFX10-NEXT:    v_mov_b32_e32 v2, s0
1754; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1755; GFX10-NEXT:    s_endpgm
1756  %gep = getelementptr i64, i64* %ptr, i32 4
1757  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
1758  ret void
1759}
1760
; Per-workitem variant: the atomic address is %ptr indexed by workitem id x
; and then advanced by a further 5 i64 elements (40 bytes); the returned value
; is stored to %out indexed by the same workitem id. The expected output
; scales the id with a shift by 3 and does 64-bit vector adds for both
; addresses. Only the gfx900 expectations encode the constant as offset:40;
; the other targets add 40 to the address with an extra add/add-with-carry.
1761define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 {
1762; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
1763; CI:       ; %bb.0:
1764; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1765; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1766; CI-NEXT:    s_waitcnt lgkmcnt(0)
1767; CI-NEXT:    v_mov_b32_e32 v0, s2
1768; CI-NEXT:    v_mov_b32_e32 v1, s3
1769; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
1770; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1771; CI-NEXT:    v_mov_b32_e32 v0, s0
1772; CI-NEXT:    v_mov_b32_e32 v1, s1
1773; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1774; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1775; CI-NEXT:    v_mov_b32_e32 v2, 42
1776; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
1777; CI-NEXT:    v_mov_b32_e32 v3, 0
1778; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1779; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
1780; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1781; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1782; CI-NEXT:    s_endpgm
1783;
1784; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
1785; VI:       ; %bb.0:
1786; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1787; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1788; VI-NEXT:    s_waitcnt lgkmcnt(0)
1789; VI-NEXT:    v_mov_b32_e32 v0, s2
1790; VI-NEXT:    v_mov_b32_e32 v1, s3
1791; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
1792; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1793; VI-NEXT:    v_mov_b32_e32 v0, s0
1794; VI-NEXT:    v_mov_b32_e32 v1, s1
1795; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1796; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1797; VI-NEXT:    v_mov_b32_e32 v2, 42
1798; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
1799; VI-NEXT:    v_mov_b32_e32 v3, 0
1800; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1801; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
1802; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1803; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1804; VI-NEXT:    s_endpgm
1805;
1806; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
1807; GFX9:       ; %bb.0:
1808; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1809; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1810; GFX9-NEXT:    v_mov_b32_e32 v4, 42
1811; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1812; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1813; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1814; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1815; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
1816; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1817; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc
1818; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1819; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1820; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
1821; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1822; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1823; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1824; GFX9-NEXT:    s_endpgm
1825;
1826; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
1827; GFX10:       ; %bb.0:
1828; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1829; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1830; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1831; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1832; GFX10-NEXT:    v_mov_b32_e32 v1, s3
1833; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v4
1834; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
1835; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1836; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
1837; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1838; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1839; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1840; GFX10-NEXT:    v_mov_b32_e32 v3, s1
1841; GFX10-NEXT:    v_mov_b32_e32 v2, s0
1842; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1843; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1844; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1845; GFX10-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1846; GFX10-NEXT:    s_endpgm
1847  %id = call i32 @llvm.amdgcn.workitem.id.x()
1848  %gep.tid = getelementptr i64, i64* %ptr, i32 %id
1849  %out.gep = getelementptr i64, i64* %out, i32 %id
1850  %gep = getelementptr i64, i64* %gep.tid, i32 5
1851  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
1852  store i64 %result, i64* %out.gep
1853  ret void
1854}
1855
; Unused-result counterpart of @flat_atomic_inc_ret_i64_offset_addr64: the
; atomic address is %ptr indexed by workitem id x plus 5 i64 elements
; (40 bytes), and nothing is stored afterwards. Only the gfx900 expectations
; use offset:40 on the instruction; the other targets add 40 explicitly. The
; atomic keeps a destination register pair and glc in every expectation.
1856define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 {
1857; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
1858; CI:       ; %bb.0:
1859; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1860; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1861; CI-NEXT:    s_waitcnt lgkmcnt(0)
1862; CI-NEXT:    v_mov_b32_e32 v0, s0
1863; CI-NEXT:    v_mov_b32_e32 v1, s1
1864; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
1865; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1866; CI-NEXT:    v_mov_b32_e32 v0, 42
1867; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
1868; CI-NEXT:    v_mov_b32_e32 v1, 0
1869; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1870; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1871; CI-NEXT:    s_endpgm
1872;
1873; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
1874; VI:       ; %bb.0:
1875; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1876; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1877; VI-NEXT:    s_waitcnt lgkmcnt(0)
1878; VI-NEXT:    v_mov_b32_e32 v0, s0
1879; VI-NEXT:    v_mov_b32_e32 v1, s1
1880; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
1881; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1882; VI-NEXT:    v_mov_b32_e32 v0, 42
1883; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
1884; VI-NEXT:    v_mov_b32_e32 v1, 0
1885; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1886; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1887; VI-NEXT:    s_endpgm
1888;
1889; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
1890; GFX9:       ; %bb.0:
1891; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1892; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1893; GFX9-NEXT:    v_mov_b32_e32 v1, 42
1894; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1895; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1896; GFX9-NEXT:    v_mov_b32_e32 v4, s1
1897; GFX9-NEXT:    v_mov_b32_e32 v3, s0
1898; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
1899; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
1900; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
1901; GFX9-NEXT:    s_endpgm
1902;
1903; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
1904; GFX10:       ; %bb.0:
1905; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1906; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1907; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1908; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1909; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1910; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
1911; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
1912; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1913; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
1914; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1915; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1916; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
1917; GFX10-NEXT:    s_endpgm
1918  %id = call i32 @llvm.amdgcn.workitem.id.x()
1919  %gep.tid = getelementptr i64, i64* %ptr, i32 %id
1920  %gep = getelementptr i64, i64* %gep.tid, i32 5
1921  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
1922  ret void
1923}
1924
; Two textually identical i32 atomic inc calls on the same address-space-3
; pointer with operand 42; the first result is stored to %out0 and the second
; to %out1. The expected output for every target contains two ds_inc_rtn_u32
; instructions, so the calls must stay separate rather than being merged into
; one (the "nocse" in the kernel name).
1925define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
1926; CI-LABEL: nocse_lds_atomic_inc_ret_i32:
1927; CI:       ; %bb.0:
1928; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1929; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1930; CI-NEXT:    v_mov_b32_e32 v0, 42
1931; CI-NEXT:    s_mov_b32 m0, -1
1932; CI-NEXT:    s_waitcnt lgkmcnt(0)
1933; CI-NEXT:    v_mov_b32_e32 v2, s2
1934; CI-NEXT:    v_mov_b32_e32 v1, s4
1935; CI-NEXT:    ds_inc_rtn_u32 v4, v1, v0
1936; CI-NEXT:    ds_inc_rtn_u32 v5, v1, v0
1937; CI-NEXT:    v_mov_b32_e32 v0, s0
1938; CI-NEXT:    v_mov_b32_e32 v1, s1
1939; CI-NEXT:    v_mov_b32_e32 v3, s3
1940; CI-NEXT:    s_waitcnt lgkmcnt(1)
1941; CI-NEXT:    flat_store_dword v[0:1], v4
1942; CI-NEXT:    s_waitcnt lgkmcnt(0)
1943; CI-NEXT:    flat_store_dword v[2:3], v5
1944; CI-NEXT:    s_endpgm
1945;
1946; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
1947; VI:       ; %bb.0:
1948; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1949; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1950; VI-NEXT:    v_mov_b32_e32 v0, 42
1951; VI-NEXT:    s_mov_b32 m0, -1
1952; VI-NEXT:    s_waitcnt lgkmcnt(0)
1953; VI-NEXT:    v_mov_b32_e32 v2, s2
1954; VI-NEXT:    v_mov_b32_e32 v1, s4
1955; VI-NEXT:    ds_inc_rtn_u32 v4, v1, v0
1956; VI-NEXT:    ds_inc_rtn_u32 v5, v1, v0
1957; VI-NEXT:    v_mov_b32_e32 v0, s0
1958; VI-NEXT:    v_mov_b32_e32 v1, s1
1959; VI-NEXT:    v_mov_b32_e32 v3, s3
1960; VI-NEXT:    s_waitcnt lgkmcnt(1)
1961; VI-NEXT:    flat_store_dword v[0:1], v4
1962; VI-NEXT:    s_waitcnt lgkmcnt(0)
1963; VI-NEXT:    flat_store_dword v[2:3], v5
1964; VI-NEXT:    s_endpgm
1965;
1966; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:
1967; GFX9:       ; %bb.0:
1968; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1969; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1970; GFX9-NEXT:    v_mov_b32_e32 v0, 42
1971; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1972; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1973; GFX9-NEXT:    ds_inc_rtn_u32 v2, v1, v0
1974; GFX9-NEXT:    ds_inc_rtn_u32 v0, v1, v0
1975; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1976; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
1977; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
1978; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1980; GFX9-NEXT:    s_endpgm
1981;
1982; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32:
1983; GFX10:       ; %bb.0:
1984; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
1985; GFX10-NEXT:    v_mov_b32_e32 v0, 42
1986; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1987; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1988; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1989; GFX10-NEXT:    ds_inc_rtn_u32 v2, v1, v0
1990; GFX10-NEXT:    ds_inc_rtn_u32 v0, v1, v0
1991; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1992; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
1994; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
1995; GFX10-NEXT:    s_endpgm
1996  %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
1997  %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
1998
1999  store i32 %result0, i32 addrspace(1)* %out0
2000  store i32 %result1, i32 addrspace(1)* %out1
2001  ret void
2002}
2003
; Attribute groups referenced above:
;   #0 - applied to most test kernels (a few spell out `nounwind` instead)
;   #1 - applied to the @llvm.amdgcn.workitem.id.x declaration
;   #2 - applied to the @llvm.amdgcn.atomic.inc.* intrinsic declarations
2004attributes #0 = { nounwind }
2005attributes #1 = { nounwind readnone }
2006attributes #2 = { nounwind argmemonly }
2007