; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; LD1B
;
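; Immediates in the range [-8, 7] (in multiples of VL) fit the reg+imm
; addressing mode of LD1, so offsets at or inside these bounds should be
; folded directly into the instruction.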
define <vscale x 16 x i8> @ld1b_upper_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_upper_bound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_inbound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_inbound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}
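; Loads of a narrower element type select the extending forms: a zero-extended
; result uses the unsigned variant (ld1b) and a sign-extended result uses the
; signed variant (ld1sb).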
define <vscale x 4 x i32> @ld1b_s_inbound(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_s_inbound:
; CHECK: ld1b { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 4 x i8>*
  %base = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i8>* %base to i8*
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sb_s_inbound(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_s_inbound:
; CHECK: ld1sb { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 4 x i8>*
  %base = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i8>* %base to i8*
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 16 x i8> @ld1b_lower_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_lower_bound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 -8
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}
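; Offsets outside the [-8, 7] range cannot be encoded as an immediate, so the
; offset is materialised with rdvl and the register-offset form is used instead.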
define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_out_of_upper_bound:
; CHECK: rdvl x[[OFFSET:[0-9]+]], #8
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 8
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_out_of_lower_bound:
; CHECK: rdvl x[[OFFSET:[0-9]+]], #-9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 -9
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

;
; LD1H
;
define <vscale x 8 x i16> @ld1b_h_inbound(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_h_inbound:
; CHECK: ld1b { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 8 x i8>*
  %base = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 8 x i8>* %base to i8*
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1sb_h_inbound(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_h_inbound:
; CHECK: ld1sb { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 8 x i8>*
  %base = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 8 x i8>* %base to i8*
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1h_inbound(<vscale x 8 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 8 x i16>*
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x i16>* %base to i16*
  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pg, i16* %base_scalar)
  ret <vscale x 8 x i16> %load
}

define <vscale x 4 x i32> @ld1h_s_inbound(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_s_inbound:
; CHECK: ld1h { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 4 x i16>*
  %base = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i16>* %base to i16*
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, i16* %base_scalar)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sh_s_inbound(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1sh_s_inbound:
; CHECK: ld1sh { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 4 x i16>*
  %base = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i16>* %base to i16*
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, i16* %base_scalar)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ld1b_d_inbound(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_d_inbound:
; CHECK: ld1b { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 2 x i8>*
  %base = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i8>* %base to i8*
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sb_d_inbound(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_d_inbound:
; CHECK: ld1sb { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 2 x i8>*
  %base = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i8>* %base to i8*
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1h_d_inbound(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_d_inbound:
; CHECK: ld1h { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 2 x i16>*
  %base = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i16>* %base to i16*
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, i16* %base_scalar)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sh_d_inbound(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1sh_d_inbound:
; CHECK: ld1sh { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 2 x i16>*
  %base = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i16>* %base to i16*
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, i16* %base_scalar)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 8 x half> @ld1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
; CHECK-LABEL: ld1h_f16_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast half* %a to <vscale x 8 x half>*
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x half>* %base to half*
  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pg, half* %base_scalar)
  ret <vscale x 8 x half> %load
}

define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) #0 {
; CHECK-LABEL: ld1h_bf16_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
  ret <vscale x 8 x bfloat> %load
}
;
; LD1W
;

define <vscale x 4 x i32> @ld1w_inbound(<vscale x 4 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1w_inbound:
; CHECK: ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 4 x i32>*
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i32>* %base to i32*
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pg, i32* %base_scalar)
  ret <vscale x 4 x i32> %load
}

define <vscale x 4 x float> @ld1w_f32_inbound(<vscale x 4 x i1> %pg, float* %a) {
; CHECK-LABEL: ld1w_f32_inbound:
; CHECK: ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast float* %a to <vscale x 4 x float>*
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x float>* %base to float*
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pg, float* %base_scalar)
  ret <vscale x 4 x float> %load
}

;
; LD1D
;

define <vscale x 2 x i64> @ld1d_inbound(<vscale x 2 x i1> %pg, i64* %a) {
; CHECK-LABEL: ld1d_inbound:
; CHECK: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i64* %a to <vscale x 2 x i64>*
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 2 x i64>* %base to i64*
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pg, i64* %base_scalar)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @ld1w_d_inbound(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1w_d_inbound:
; CHECK: ld1w { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 2 x i32>*
  %base = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i32>* %base to i32*
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, i32* %base_scalar)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sw_d_inbound(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1sw_d_inbound:
; CHECK: ld1sw { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 2 x i32>*
  %base = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i32>* %base to i32*
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, i32* %base_scalar)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1d_f64_inbound(<vscale x 2 x i1> %pg, double* %a) {
; CHECK-LABEL: ld1d_f64_inbound:
; CHECK: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast double* %a to <vscale x 2 x double>*
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 2 x double>* %base to double*
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pg, double* %base_scalar)
  ret <vscale x 2 x double> %load
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)

declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, float*)

declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, i16*)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }