; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
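
; The tests below check that masked.load and masked.store intrinsics are lowered
; to SVE contiguous loads/stores using the reg+reg addressing mode, with the
; offset register scaled by the in-memory element size: no shift for byte
; accesses and lsl #1/#2/#3 for 16-, 32- and 64-bit elements, as the CHECK
; lines expect.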

; 2-lane contiguous load/stores.
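; For 2-lane vectors the i8/i16/i32 element types are unpacked, so the expected
; code uses extending loads (ld1sb/ld1sh/ld1sw) that widen each element to its
; 64-bit container and truncating stores (st1b/st1h/st1w) that narrow it again.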

define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      <vscale x 2 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       <vscale x 2 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       <vscale x 2 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, i64* %base, i64 %offset
  %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_half to <vscale x 2 x half>*
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       <vscale x 2 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_float to <vscale x 2 x float>*
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       <vscale x 2 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, double* %base, i64 %offset
  %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       <vscale x 2 x double>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.
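; A zext/sext of the loaded value is expected to fold into the load itself,
; selecting the zero-extending (ld1b/ld1h/ld1w) or sign-extending
; (ld1sb/ld1sh/ld1sw) form, so no separate extend instruction should appear.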

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.
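; Likewise, a trunc feeding the masked store is expected to fold into the
; truncating st1b/st1h/st1w form rather than being emitted as a separate
; instruction.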

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      <vscale x 2 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       <vscale x 2 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       <vscale x 2 x i32> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.
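; The 4-, 8- and 16-lane blocks below mirror the 2-lane tests above, using the
; .s, .h and .b container sizes respectively, with the offset register still
; scaled by the in-memory element size.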

define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      <vscale x 4 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       <vscale x 4 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       <vscale x 4 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       <vscale x 4 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       <vscale x 4 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      <vscale x 4 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       <vscale x 4 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      <vscale x 8 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       <vscale x 8 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       <vscale x 8 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_addr,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      <vscale x 8 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.
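; 16 x i8 is the fully packed byte form, so no extension or truncation is
; involved and the offset register is used unscaled.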

define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       <vscale x 16 x i8>* %base_addr,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8>  @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8>  @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }