; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Kernel ds_read32_combine_stride_400 sums eight floats loaded from the
; addrspace(3) pointer %arg at element indices 0, 100, ..., 700 (400 bytes
; apart, ascending order) and stores the total through %arg1.
; The checks expect the loads to pair up into four ds_read2_b32 instructions,
; each with offset1:100 -- one addressed by the base register and three by
; registers formed by adding 0x320, 0x640 and 0x960 to it (literal operands on
; gfx900; on tonga the addend is only matched as some SGPR).
; GCN-LABEL: ds_read32_combine_stride_400:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = load float, float addrspace(3)* %arg, align 4
  %tmp2 = fadd float %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
  %tmp5 = fadd float %tmp2, %tmp4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
  %tmp8 = fadd float %tmp5, %tmp7
  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
  %tmp11 = fadd float %tmp8, %tmp10
  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
  %tmp14 = fadd float %tmp11, %tmp13
  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
  %tmp17 = fadd float %tmp14, %tmp16
  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
  %tmp20 = fadd float %tmp17, %tmp19
  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
  %tmp23 = fadd float %tmp20, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Kernel ds_read32_combine_stride_400_back reads the same eight floats as the
; ascending 400-byte-stride read test (element indices 0, 100, ..., 700 of the
; addrspace(3) pointer %arg), but issues the loads in descending index order
; (700 first, 0 last) before storing the sum through %arg1.
; The expected output is the same: four ds_read2_b32 instructions with
; offset1:100, on the base register and on bases offset by 0x320, 0x640 and
; 0x960 (literal operands on gfx900; matched as an SGPR addend on tonga).
; GCN-LABEL: ds_read32_combine_stride_400_back:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %tmp2 = load float, float addrspace(3)* %tmp, align 4
  %tmp3 = fadd float %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
  %tmp6 = fadd float %tmp3, %tmp5
  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
  %tmp9 = fadd float %tmp6, %tmp8
  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
  %tmp12 = fadd float %tmp9, %tmp11
  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
  %tmp15 = fadd float %tmp12, %tmp14
  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
  %tmp18 = fadd float %tmp15, %tmp17
  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
  %tmp21 = fadd float %tmp18, %tmp20
  %tmp22 = load float, float addrspace(3)* %arg, align 4
  %tmp23 = fadd float %tmp21, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Kernel ds_read32_combine_stride_8192 sums eight floats loaded from the
; addrspace(3) pointer %arg at element indices 0, 2048, ..., 14336 (8192 bytes
; apart) and stores the total through %arg1.
; The checks expect four ds_read2st64_b32 instructions that all use the
; unmodified base register, with offset pairs 0/32, 64/96, 128/160 and 192/224;
; no separate address additions are checked for this kernel.
; GCN-LABEL: ds_read32_combine_stride_8192:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = load float, float addrspace(3)* %arg, align 4
  %tmp2 = fadd float %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
  %tmp5 = fadd float %tmp2, %tmp4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
  %tmp8 = fadd float %tmp5, %tmp7
  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
  %tmp11 = fadd float %tmp8, %tmp10
  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
  %tmp14 = fadd float %tmp11, %tmp13
  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
  %tmp17 = fadd float %tmp14, %tmp16
  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
  %tmp20 = fadd float %tmp17, %tmp19
  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
  %tmp23 = fadd float %tmp20, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Kernel ds_read32_combine_stride_8192_shifted sums six floats loaded from the
; addrspace(3) pointer %arg at element indices 2, 2050, ..., 10242 (an
; 8192-byte stride starting 8 bytes past %arg) and stores the total through
; %arg1.
; The checks expect three ds_read2st64_b32 instructions with offset1:32, each
; on its own base register formed by adding 8, 0x4008 and 0x8008 to the base
; (on tonga only the 8 is matched literally; the other addends are matched as
; an SGPR).
; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
  %tmp2 = load float, float addrspace(3)* %tmp, align 4
  %tmp3 = fadd float %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
  %tmp6 = fadd float %tmp3, %tmp5
  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
  %tmp9 = fadd float %tmp6, %tmp8
  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
  %tmp12 = fadd float %tmp9, %tmp11
  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
  %tmp15 = fadd float %tmp12, %tmp14
  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
  %tmp18 = fadd float %tmp15, %tmp17
  store float %tmp18, float *%arg1, align 4
  ret void
}

; Kernel ds_read64_combine_stride_400 sums eight doubles loaded from the
; addrspace(3) pointer %arg at element indices 0, 50, ..., 350 (400 bytes
; apart) and stores the total through %arg1.
; The checks expect four ds_read2_b64 instructions: three on the unmodified
; base register with offset pairs 0/50, 100/150 and 200/250, and a fourth with
; offset1:50 on a second base formed by adding 0x960 (a literal on gfx900;
; matched as an SGPR addend on tonga).
; GCN-LABEL: ds_read64_combine_stride_400:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  %tmp = load double, double addrspace(3)* %arg, align 8
  %tmp2 = fadd double %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  %tmp4 = load double, double addrspace(3)* %tmp3, align 8
  %tmp5 = fadd double %tmp2, %tmp4
  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  %tmp7 = load double, double addrspace(3)* %tmp6, align 8
  %tmp8 = fadd double %tmp5, %tmp7
  %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  %tmp10 = load double, double addrspace(3)* %tmp9, align 8
  %tmp11 = fadd double %tmp8, %tmp10
  %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  %tmp13 = load double, double addrspace(3)* %tmp12, align 8
  %tmp14 = fadd double %tmp11, %tmp13
  %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  %tmp16 = load double, double addrspace(3)* %tmp15, align 8
  %tmp17 = fadd double %tmp14, %tmp16
  %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  %tmp19 = load double, double addrspace(3)* %tmp18, align 8
  %tmp20 = fadd double %tmp17, %tmp19
  %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  %tmp22 = load double, double addrspace(3)* %tmp21, align 8
  %tmp23 = fadd double %tmp20, %tmp22
  store double %tmp23, double *%arg1, align 8
  ret void
}

; Kernel ds_read64_combine_stride_8192_shifted sums six doubles loaded from the
; addrspace(3) pointer %arg at element indices 1, 1025, ..., 5121 (an
; 8192-byte stride starting 8 bytes past %arg) and stores the total through
; %arg1.
; The checks expect three ds_read2st64_b64 instructions with offset1:16, each
; on its own base register formed by adding 8, 0x4008 and 0x8008 to the base
; (on tonga only the 8 is matched literally; the other addends are matched as
; an SGPR).
; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  %tmp2 = load double, double addrspace(3)* %tmp, align 8
  %tmp3 = fadd double %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  %tmp5 = load double, double addrspace(3)* %tmp4, align 8
  %tmp6 = fadd double %tmp3, %tmp5
  %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  %tmp8 = load double, double addrspace(3)* %tmp7, align 8
  %tmp9 = fadd double %tmp6, %tmp8
  %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  %tmp11 = load double, double addrspace(3)* %tmp10, align 8
  %tmp12 = fadd double %tmp9, %tmp11
  %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  %tmp14 = load double, double addrspace(3)* %tmp13, align 8
  %tmp15 = fadd double %tmp12, %tmp14
  %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  %tmp17 = load double, double addrspace(3)* %tmp16, align 8
  %tmp18 = fadd double %tmp15, %tmp17
  store double %tmp18, double *%arg1, align 8
  ret void
}

; Kernel ds_write32_combine_stride_400 stores the float constant 1.0 through
; the addrspace(3) pointer %arg at element indices 0, 100, ..., 700 (400 bytes
; apart, ascending order).
; The checks expect the stores to pair up into four ds_write2_b32 instructions
; with offset1:100 -- one addressed by the base register and three by registers
; formed by adding 0x320, 0x640 and 0x960 to it (literal operands on gfx900;
; on tonga the addend is only matched as some SGPR).
; GCN-LABEL: ds_write32_combine_stride_400:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
bb:
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  ret void
}

; Kernel ds_write32_combine_stride_400_back stores the float constant 1.0
; through the addrspace(3) pointer %arg at element indices 0, 100, ..., 700
; (400 bytes apart), issuing the stores in descending index order (700 first,
; 0 last).
; The expected output matches the ascending-order variant: four ds_write2_b32
; instructions with offset1:100, on the base register and on bases offset by
; 0x320, 0x640 and 0x960 (literal operands on gfx900; matched as an SGPR addend
; on tonga).
; GCN-LABEL: ds_write32_combine_stride_400_back:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  ret void
}

; Kernel ds_write32_combine_stride_8192 stores the float constant 1.0 through
; the addrspace(3) pointer %arg at element indices 0, 2048, ..., 14336 (8192
; bytes apart).
; The checks expect four ds_write2st64_b32 instructions that all use the
; unmodified base register, with offset pairs 0/32, 64/96, 128/160 and 192/224;
; no separate address additions are checked for this kernel.
; GCN-LABEL: ds_write32_combine_stride_8192:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
bb:
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  ret void
}

; Kernel ds_write32_combine_stride_8192_shifted stores the float constant 1.0
; through the addrspace(3) pointer %arg at element indices 1, 2049, ..., 10241
; (an 8192-byte stride starting 4 bytes past %arg).
; The checks expect three ds_write2st64_b32 instructions with offset1:32, each
; on its own base register formed by adding 4, 0x4004 and 0x8004 to the base
; (on tonga only the 4 is matched literally; the other addends are matched as
; an SGPR).
; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]

; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  ret void
}

; Kernel ds_write64_combine_stride_400 stores the double constant 1.0 through
; the addrspace(3) pointer %arg at element indices 0, 50, ..., 350 (400 bytes
; apart).
; The checks expect four ds_write2_b64 instructions: three on the unmodified
; base register with offset pairs 0/50, 100/150 and 200/250, and a fourth with
; offset1:50 on a second base formed by adding 0x960 (a literal on gfx900;
; matched as an SGPR addend on tonga).
; GCN-LABEL: ds_write64_combine_stride_400:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
; GCN-DAG: ds_write2_b64 [[B1]],   v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
bb:
  store double 1.000000e+00, double addrspace(3)* %arg, align 8
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
  ret void
}

; Kernel ds_write64_combine_stride_8192_shifted stores the double constant 1.0
; through the addrspace(3) pointer %arg at element indices 1, 1025, ..., 5121
; (an 8192-byte stride starting 8 bytes past %arg).
; The checks expect three ds_write2st64_b64 instructions with offset1:16, each
; on its own base register formed by adding 8, 0x4008 and 0x8008 to the base
; (on tonga only the 8 is matched literally; the other addends are matched as
; an SGPR).
; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
  ret void
}
