; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails, please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit immediate). Out-of-range
; values are tested only in the following case, where the offset has to be
; materialised separately (rdvl + add). Valid values are tested throughout
; the rest of the file.

define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(<vscale x 2 x i8> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -8
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %base_store = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8> * %base, i64 -7
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      <vscale x 2 x i8>* %base_store,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(<vscale x 2 x i16> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 -8
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %base_store = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16> * %base, i64 -7
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       <vscale x 2 x i16>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}


define void @test_masked_ldst_sv2i32(<vscale x 2 x i32> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -8
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %base_store = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32> * %base, i64 -7
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       <vscale x 2 x i32>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(<vscale x 2 x half> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x half>, <vscale x 2 x half>* %base, i64 -8
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_load,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  %base_store = getelementptr <vscale x 2 x half>, <vscale x 2 x half> * %base, i64 -7
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       <vscale x 2 x half>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}


define void @test_masked_ldst_sv2f32(<vscale x 2 x float> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x float>, <vscale x 2 x float>* %base, i64 -8
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_load,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  %base_store = getelementptr <vscale x 2 x float>, <vscale x 2 x float> * %base, i64 -7
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       <vscale x 2 x float>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_load,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       <vscale x 2 x double>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -4
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -3
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 1
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 2
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -2
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -1
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, <vscale x 2 x i8> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 3
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      <vscale x 2 x i8> *%base_load,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}


define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, <vscale x 2 x i16> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 4
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       <vscale x 2 x i16> *%base_load,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, <vscale x 2 x i32> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 5
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       <vscale x 2 x i32> *%base_load,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(<vscale x 4 x i8> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -1
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %base_store = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8> * %base, i64 2
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      <vscale x 4 x i8>* %base_store,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(<vscale x 4 x i16> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 -1
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %base_store = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16> * %base, i64 2
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       <vscale x 4 x i16>* %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       <vscale x 4 x i32>* %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(<vscale x 4 x half> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x half>, <vscale x 4 x half>* %base, i64 -1
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_load,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  %base_store = getelementptr <vscale x 4 x half>, <vscale x 4 x half> * %base, i64 2
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       <vscale x 4 x half>* %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_load,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       <vscale x 4 x float>* %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -4
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -3
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 1
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 2
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, <vscale x 4 x i8> *%base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, #3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 3
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      <vscale x 4 x i8> *%base_load,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}


define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, <vscale x 4 x i16> *%base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, #4, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 4
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       <vscale x 4 x i16> *%base_load,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(<vscale x 8 x i8> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 6
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %base_store = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8> * %base, i64 7
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      <vscale x 8 x i8>* %base_store,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_load,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       <vscale x 8 x i16>* %base_store,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_load,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       <vscale x 8 x half>* %base_store,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_load,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_store,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -4
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -3
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, <vscale x 8 x i8> *%base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, #3, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 3
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      <vscale x 8 x i8> *%base_load,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_load,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       <vscale x 16 x i8>* %base_store,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8>  @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8>  @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }