; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

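; 4 x i32 widened to i64: trunc(ashr(add(sext(a), zext(b)), zext(c))). The
; 64-bit shifts are currently done element by element with asrl on GPR pairs.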
define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C) {
; CHECK-LABEL: loads_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vldrw.u32 q5, [r2]
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r4, r1, d4
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov.f32 s16, s22
; CHECK-NEXT:    vmov.f32 s18, s23
; CHECK-NEXT:    vmov r3, lr, d0
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov r0, r12, d5
; CHECK-NEXT:    vmov.f32 s8, s20
; CHECK-NEXT:    vmov.f32 s10, s21
; CHECK-NEXT:    adds r2, r5, r4
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    adcs r1, r6
; CHECK-NEXT:    asrl r2, r1, r4
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    adds r6, r1, r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    asr.w r4, r1, #31
; CHECK-NEXT:    adc.w r1, r4, lr
; CHECK-NEXT:    asrl r6, r1, r3
; CHECK-NEXT:    vmov r5, r4, d1
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    asr.w r3, r1, #31
; CHECK-NEXT:    adc.w r1, r3, r12
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    asrl r0, r1, r3
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    adds r6, r1, r5
; CHECK-NEXT:    asr.w r2, r1, #31
; CHECK-NEXT:    adc.w r1, r2, r4
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %c = load <4 x i32>, <4 x i32> *%C, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %sc = zext <4 x i32> %c to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %sc
  %t = trunc <4 x i64> %sh to <4 x i32>
  ret <4 x i32> %t
}

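; 8 x i16 widened to i32: the whole add/ashr/trunc chain stays in vector form,
; using vmovlb/vmovlt for the extends, vneg+vshl for the variable right shift
; and vmovnt to narrow back to i16.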
define arm_aapcs_vfpcc <8 x i16> @loads_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C) {
; CHECK-LABEL: loads_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmovlb.s16 q1, q0
; CHECK-NEXT:    vmovlb.s16 q3, q2
; CHECK-NEXT:    vmovlt.s16 q0, q0
; CHECK-NEXT:    vmovlt.s16 q2, q2
; CHECK-NEXT:    vadd.i32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r2]
; CHECK-NEXT:    vadd.i32 q1, q3, q1
; CHECK-NEXT:    vmovlt.u16 q3, q2
; CHECK-NEXT:    vneg.s32 q3, q3
; CHECK-NEXT:    vshl.s32 q3, q0, q3
; CHECK-NEXT:    vmovlb.u16 q0, q2
; CHECK-NEXT:    vneg.s32 q0, q0
; CHECK-NEXT:    vshl.s32 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q0, q3
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %c = load <8 x i16>, <8 x i16> *%C, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %sc = zext <8 x i16> %c to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %sc
  %t = trunc <8 x i32> %sh to <8 x i16>
  ret <8 x i16> %t
}

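; 16 x i8 widened to i16: same pattern as loads_i16, one element size down.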
define arm_aapcs_vfpcc <16 x i8> @loads_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C) {
; CHECK-LABEL: loads_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmovlb.s8 q1, q0
; CHECK-NEXT:    vmovlb.s8 q3, q2
; CHECK-NEXT:    vmovlt.s8 q0, q0
; CHECK-NEXT:    vmovlt.s8 q2, q2
; CHECK-NEXT:    vadd.i16 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r2]
; CHECK-NEXT:    vadd.i16 q1, q3, q1
; CHECK-NEXT:    vmovlt.u8 q3, q2
; CHECK-NEXT:    vneg.s16 q3, q3
; CHECK-NEXT:    vshl.s16 q3, q0, q3
; CHECK-NEXT:    vmovlb.u8 q0, q2
; CHECK-NEXT:    vneg.s16 q0, q0
; CHECK-NEXT:    vshl.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i16 q0, q3
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %c = load <16 x i8>, <16 x i8> *%C, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %sc = zext <16 x i8> %c to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %sc
  %t = trunc <16 x i16> %sh to <16 x i8>
  ret <16 x i8> %t
}

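; As loads_i32, but the result is stored to %D instead of being returned.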
define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
; CHECK-LABEL: load_store_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vldrw.u32 q5, [r2]
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vmov.f32 s6, s11
; CHECK-NEXT:    vmov.f32 s10, s9
; CHECK-NEXT:    vand q1, q1, q0
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r5, r1, d2
; CHECK-NEXT:    vmov.f32 s12, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov r6, s12
; CHECK-NEXT:    vmov.f32 s16, s22
; CHECK-NEXT:    vmov.f32 s18, s23
; CHECK-NEXT:    vmov r4, lr, d4
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r0, r12, d3
; CHECK-NEXT:    vmov.f32 s4, s20
; CHECK-NEXT:    vmov.f32 s6, s21
; CHECK-NEXT:    adds r2, r6, r5
; CHECK-NEXT:    vmov r5, s16
; CHECK-NEXT:    asr.w r7, r6, #31
; CHECK-NEXT:    adcs r1, r7
; CHECK-NEXT:    asrl r2, r1, r5
; CHECK-NEXT:    vmov r7, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    adds r4, r4, r1
; CHECK-NEXT:    asr.w r5, r1, #31
; CHECK-NEXT:    adc.w r1, r5, lr
; CHECK-NEXT:    asrl r4, r1, r7
; CHECK-NEXT:    vmov r6, r5, d5
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    asr.w r7, r1, #31
; CHECK-NEXT:    adc.w r1, r7, r12
; CHECK-NEXT:    vmov r7, s18
; CHECK-NEXT:    asrl r0, r1, r7
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    adds r6, r6, r1
; CHECK-NEXT:    asr.w r2, r1, #31
; CHECK-NEXT:    adc.w r1, r2, r5
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    vmov q2[3], q2[1], r6, r0
; CHECK-NEXT:    vstrw.32 q2, [r3]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %c = load <4 x i32>, <4 x i32> *%C, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %sc = zext <4 x i32> %c to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %sc
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

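; As loads_i16, but storing the result lets the extends and the truncation fold
; into extending loads (vldrh.s32/vldrh.u32) and truncating stores (vstrh.32).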
define arm_aapcs_vfpcc void @load_store_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C, <8 x i16> *%D) {
; CHECK-LABEL: load_store_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
; CHECK-NEXT:    vldrh.s32 q2, [r0]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vldrh.u32 q1, [r2, #8]
; CHECK-NEXT:    vneg.s32 q1, q1
; CHECK-NEXT:    vshl.s32 q0, q0, q1
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrh.u32 q2, [r2]
; CHECK-NEXT:    vstrh.32 q0, [r3, #8]
; CHECK-NEXT:    vneg.s32 q2, q2
; CHECK-NEXT:    vshl.s32 q1, q1, q2
; CHECK-NEXT:    vstrh.32 q1, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %c = load <8 x i16>, <8 x i16> *%C, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %sc = zext <8 x i16> %c to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %sc
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

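; As load_store_i16 at i8/i16, using vldrb.s16/vldrb.u16 and vstrb.16.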
define arm_aapcs_vfpcc void @load_store_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C, <16 x i8> *%D) {
; CHECK-LABEL: load_store_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r0, #8]
; CHECK-NEXT:    vldrb.s16 q2, [r0]
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vldrb.u16 q1, [r2, #8]
; CHECK-NEXT:    vneg.s16 q1, q1
; CHECK-NEXT:    vshl.s16 q0, q0, q1
; CHECK-NEXT:    vldrb.s16 q1, [r1]
; CHECK-NEXT:    vadd.i16 q1, q2, q1
; CHECK-NEXT:    vldrb.u16 q2, [r2]
; CHECK-NEXT:    vstrb.16 q0, [r3, #8]
; CHECK-NEXT:    vneg.s16 q2, q2
; CHECK-NEXT:    vshl.s16 q1, q1, q2
; CHECK-NEXT:    vstrb.16 q1, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %c = load <16 x i8>, <16 x i8> *%C, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %sc = zext <16 x i8> %c to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %sc
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}

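; Single-operand variant: %a is both addend and shift amount. The i32 case is
; again scalarised through asrl.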
define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s6, s3
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    adds.w r12, r2, r2
; CHECK-NEXT:    asr.w r3, r2, #31
; CHECK-NEXT:    adc.w r7, r3, r2, asr #31
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    asrl r12, r7, r2
; CHECK-NEXT:    adds r0, r3, r3
; CHECK-NEXT:    asr.w r5, r3, #31
; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
; CHECK-NEXT:    asrl r0, r5, r3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adds r4, r3, r3
; CHECK-NEXT:    asr.w r5, r3, #31
; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
; CHECK-NEXT:    asrl r4, r5, r3
; CHECK-NEXT:    vmov q1[2], q1[0], r4, r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    adds r4, r0, r0
; CHECK-NEXT:    asr.w r2, r0, #31
; CHECK-NEXT:    adc.w r3, r2, r0, asr #31
; CHECK-NEXT:    asrl r4, r3, r0
; CHECK-NEXT:    vmov q1[3], q1[1], r4, r12
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %add = add <4 x i64> %sa, %sa
  %sh = ashr <4 x i64> %add, %sa
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

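; Single-operand i16 variant: stays vectorised with extending loads, vadd,
; vneg+vshl and truncating stores.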
define arm_aapcs_vfpcc void @load_one_store_i16(<8 x i16> *%A, <8 x i16> *%D) {
; CHECK-LABEL: load_one_store_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vneg.s32 q1, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q0
; CHECK-NEXT:    vshl.s32 q0, q0, q1
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vneg.s32 q2, q1
; CHECK-NEXT:    vadd.i32 q1, q1, q1
; CHECK-NEXT:    vshl.s32 q1, q1, q2
; CHECK-NEXT:    vstrh.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %add = add <8 x i32> %sa, %sa
  %sh = ashr <8 x i32> %add, %sa
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

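; Single-operand i8 variant of load_one_store_i16.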
define arm_aapcs_vfpcc void @load_one_store_i8(<16 x i8> *%A, <16 x i8> *%D) {
; CHECK-LABEL: load_one_store_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r0, #8]
; CHECK-NEXT:    vneg.s16 q1, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q0
; CHECK-NEXT:    vshl.s16 q0, q0, q1
; CHECK-NEXT:    vldrb.s16 q1, [r0]
; CHECK-NEXT:    vstrb.16 q0, [r1, #8]
; CHECK-NEXT:    vneg.s16 q2, q1
; CHECK-NEXT:    vadd.i16 q1, q1, q1
; CHECK-NEXT:    vshl.s16 q1, q1, q2
; CHECK-NEXT:    vstrb.16 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %add = add <16 x i16> %sa, %sa
  %sh = ashr <16 x i16> %add, %sa
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}

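; mul instead of add, with a uniform i64 shift amount %C: the widening
; multiplies are done with vmullb.s32 and smull, and each 64-bit product is
; shifted with a scalar asrl.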
define arm_aapcs_vfpcc void @mul_i32(<4 x i32> *%A, <4 x i32> *%B, i64 %C, <4 x i32> *%D) {
; CHECK-LABEL: mul_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr.w lr, [sp, #20]
; CHECK-NEXT:    vmov.f32 s8, s0
; CHECK-NEXT:    vmov.f32 s12, s4
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    smull r12, r3, r1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s10, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    asrl r12, r3, r2
; CHECK-NEXT:    vmov.f32 s2, s7
; CHECK-NEXT:    vmullb.s32 q1, q0, q2
; CHECK-NEXT:    vmov r6, r1, d2
; CHECK-NEXT:    vmov r4, r7, d3
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    asrl r4, r7, r2
; CHECK-NEXT:    smull r0, r5, r5, r0
; CHECK-NEXT:    asrl r0, r5, r2
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r6
; CHECK-NEXT:    vmov q0[3], q0[1], r12, r4
; CHECK-NEXT:    vstrw.32 q0, [lr]
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %i = insertelement <4 x i64> undef, i64 %C, i32 0
  %c = shufflevector <4 x i64> %i, <4 x i64> undef, <4 x i32> zeroinitializer
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = sext <4 x i32> %b to <4 x i64>
  %add = mul <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %c
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

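; i16 variant with a uniform i32 shift: vmullt/vmullb.s16, the shift amount
; negated once with rsbs and applied with vshl.s32, then vmovnt.i32 to narrow.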
define arm_aapcs_vfpcc void @mul_i16(<8 x i16> *%A, <8 x i16> *%B, i32 %C, <8 x i16> *%D) {
; CHECK-LABEL: mul_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    vmullt.s16 q2, q1, q0
; CHECK-NEXT:    vmullb.s16 q0, q1, q0
; CHECK-NEXT:    vshl.s32 q2, r2
; CHECK-NEXT:    vshl.s32 q0, r2
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %i = insertelement <8 x i32> undef, i32 %C, i32 0
  %c = shufflevector <8 x i32> %i, <8 x i32> undef, <8 x i32> zeroinitializer
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %add = mul <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %c
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

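; i8 variant of mul_i16, one element size down.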
define arm_aapcs_vfpcc void @mul_i8(<16 x i8> *%A, <16 x i8> *%B, i16 %C, <16 x i8> *%D) {
; CHECK-LABEL: mul_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    vmullt.s8 q2, q1, q0
; CHECK-NEXT:    vmullb.s8 q0, q1, q0
; CHECK-NEXT:    vshl.s16 q2, r2
; CHECK-NEXT:    vshl.s16 q0, r2
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %i = insertelement <16 x i16> undef, i16 %C, i32 0
  %c = shufflevector <16 x i16> %i, <16 x i16> undef, <16 x i32> zeroinitializer
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %add = mul <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %c
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}