1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
3
; VSTRB.32 Qd, [base, offs]
; Truncating scatter: <4 x i32> data narrowed to i8, stored at base + unscaled
; i32 byte offsets loaded from memory.
define arm_aapcs_vfpcc void @ext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unscaled_i8_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
  %t = trunc <4 x i32> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
18
; VSTRH.32 Qd, [base, offs]
; Truncating scatter: <4 x i32> data narrowed to i16, stored at base + unscaled
; i32 byte offsets (the i8* GEP is bitcast to i16*, so offsets stay in bytes).
define arm_aapcs_vfpcc void @ext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unscaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
34
; VSTRW.32 Qd, [base, offs]
; Plain i32 scatter at base + unscaled i32 byte offsets.
define arm_aapcs_vfpcc void @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: unscaled_i32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
49
; VSTRW.32 Qd, [base, offs]
; Same as unscaled_i32_i32 but with a float payload; lowering is identical.
define arm_aapcs_vfpcc void @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr, <4 x float> %input) {
; CHECK-LABEL: unscaled_f32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
64
; VSTRW.32 Qd, [base, offs.zext]
; i16 offsets are zero-extended to i32; the extending offset load folds into
; vldrh.u32 so the zext costs nothing.
define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: unsigned_unscaled_b_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
80
; VSTRW.32 Qd, [base, offs.sext]
; Signed variant: the sext of the i16 offsets folds into vldrh.s32.
define arm_aapcs_vfpcc void @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: signed_unscaled_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
96
; VSTRW.32 Qd, [base, offs.zext]
; Float payload with zero-extended i16 offsets (vldrh.u32).
define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) {
; CHECK-LABEL: a_unsigned_unscaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
112
; VSTRW.32 Qd, [base, offs.sext]
; Float payload with sign-extended i16 offsets (vldrh.s32).
define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) {
; CHECK-LABEL: b_signed_unscaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
128
; VSTRH.32 Qd, [base, offs.sext]
; Truncating i32->i16 scatter with sign-extended i16 byte offsets.
define arm_aapcs_vfpcc void @ext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_signed_unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
145
; VSTRH.32 Qd, [base, offs.zext]
; Truncating i32->i16 scatter with zero-extended i16 byte offsets.
define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unsigned_unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
  %t = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
162
; VSTRB.32 Qd, [base, offs.sext]
; Truncating i32->i8 scatter with sign-extended i16 byte offsets.
define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_signed_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %t = trunc <4 x i32> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
178
; VSTRB.32 Qd, [base, offs.zext]
; Truncating i32->i8 scatter with zero-extended i16 byte offsets.
define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unsigned_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %t = trunc <4 x i32> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
194
; VSTRW.32 Qd, [base, offs.zext]
; i8 offsets zero-extended to i32; the extend folds into vldrb.u32.
define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: unsigned_unscaled_b_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
210
; VSTRW.32 Qd, [base, offs.sext]
; i8 offsets sign-extended to i32; the extend folds into vldrb.s32.
define arm_aapcs_vfpcc void @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
226
; VSTRW.32 Qd, [base, offs.zext]
; Float payload with zero-extended i8 offsets (vldrb.u32).
define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) {
; CHECK-LABEL: a_unsigned_unscaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
242
; VSTRW.32 Qd, [base, offs.sext]
; Float payload with sign-extended i8 offsets (vldrb.s32).
define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) {
; CHECK-LABEL: b_signed_unscaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
258
; VSTRB.32 Qd, [base, offs.sext]
; Truncating i32->i8 scatter with sign-extended i8 byte offsets.
define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_signed_unscaled_i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %t = trunc <4 x i32> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
274
; VSTRB.32 Qd, [base, offs.zext]
; Truncating i32->i8 scatter with zero-extended i8 byte offsets.
define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: ext_unsigned_unscaled_i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %t = trunc <4 x i32> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
290
; VSTRW.32 Qd, [base, offs.sext]
; <4 x i64> input pre-truncated to <4 x i32>: the low words of the two input
; q-registers are repacked into q0 with vmov.f32 before the scatter store.
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    vstrw.32 q0, [r0, q2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  %input.trunc = trunc <4 x i64> %input to <4 x i32>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
309
; VSTRW.32 Qd, [base, offs.zext]
; Same as trunc_signed_unscaled_i64_i8 but offsets are zero-extended
; (vldrb.u32 instead of vldrb.s32).
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vldrb.u32 q2, [r1]
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    vstrw.32 q0, [r0, q2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  %input.trunc = trunc <4 x i64> %input to <4 x i32>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
328
; VSTRH.32 Qd, [base, offs.sext]
; Explicit i32->i16 trunc of the input folds into the truncating vstrh.32.
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
  %input.trunc = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
344
; VSTRH.32 Qd, [base, offs.zext]
; Zero-extended-offset variant of trunc_signed_unscaled_i32_i8.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
  %input.trunc = trunc <4 x i32> %input to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
360
; VSTRB.32 Qd, [base, offs.sext]
; <4 x i16> input truncated to i8: lanes are masked with 0xff (vmov.i32/vand)
; before the truncating byte scatter.
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r0, q2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
  %input.trunc = trunc <4 x i16> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
377
; VSTRB.32 Qd, [base, offs.zext]
; Zero-extended-offset variant of trunc_signed_unscaled_i16_i8; input lanes
; are likewise masked with 0xff before the byte scatter.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vldrb.u32 q2, [r1]
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r0, q2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %input.trunc = trunc <4 x i16> %input to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
394
; Masked-scatter intrinsic declarations used by the tests above.
; (v4f16 is declared but unused in this chunk.)
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
400