; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

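; These tests check that the @llvm.aarch64.sve.ld1 intrinsic is selected to the
; predicated LD1B/LD1H/LD1W/LD1D instructions, and that a narrow-element ld1
; feeding a zext or sext is folded into the corresponding extending load.
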
;
; LD1B
;

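; ld1b_i8 covers the full-width byte load; the _h/_s/_d variants check that an
; ld1 of i8 elements followed by zext selects ld1b and by sext selects ld1sb.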
define <vscale x 16 x i8> @ld1b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_i8:
; CHECK: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @ld1b_h(<vscale x 8 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_h:
; CHECK: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)
  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1sb_h(<vscale x 8 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_h:
; CHECK: ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)
  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @ld1b_s(<vscale x 4 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_s:
; CHECK: ld1b { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sb_s(<vscale x 4 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_s:
; CHECK: ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ld1b_d(<vscale x 2 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_d:
; CHECK: ld1b { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sb_d(<vscale x 2 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_d:
; CHECK: ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

;
; LD1H
;

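; As above: ld1h_i16/f16/bf16 cover the full-width halfword loads, while the
; _s/_d variants check that an extended i16 ld1 selects ld1h (zext) or
; ld1sh (sext).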
define <vscale x 8 x i16> @ld1h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_i16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld1h_f16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: ld1h_bf16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_s:
; CHECK: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sh_s(<vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1sh_s:
; CHECK: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ld1h_d(<vscale x 2 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_d:
; CHECK: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sh_d(<vscale x 2 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1sh_d:
; CHECK: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

;
; LD1W
;

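; ld1w_i32/f32 cover the full-width word loads; ld1w_d and ld1sw_d check the
; zero- and sign-extending word-to-doubleword loads.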
define <vscale x 4 x i32> @ld1w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1w_i32:
; CHECK: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld1w_f32:
; CHECK: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 4 x float> %res
}

define <vscale x 2 x i64> @ld1w_d(<vscale x 2 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1w_d:
; CHECK: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sw_d(<vscale x 2 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1sw_d:
; CHECK: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

;
; LD1D
;

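; Doubleword loads have no extending form, so only the full-width i64 and f64
; element types are covered here.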
define <vscale x 2 x i64> @ld1d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld1d_i64:
; CHECK: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld1d_f64:
; CHECK: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 2 x double> %res
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)

declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, float*)

declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, i16*)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }