1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
3; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
4
; Baseline i8 lane-wise add: a plain <16 x i8> add selects a single MVE
; vadd.i8 on the full 128-bit q registers (same output for both RUN
; configurations, hence the shared CHECK prefix).
5define arm_aapcs_vfpcc <16 x i8> @add_int8_t(<16 x i8> %src1, <16 x i8> %src2) {
6; CHECK-LABEL: add_int8_t:
7; CHECK:       @ %bb.0: @ %entry
8; CHECK-NEXT:    vadd.i8 q0, q0, q1
9; CHECK-NEXT:    bx lr
10entry:
11  %0 = add <16 x i8> %src1, %src2
12  ret <16 x i8> %0
13}
14
; i16 lane-wise add: <8 x i16> add selects a single vadd.i16 on q registers.
15define arm_aapcs_vfpcc <8 x i16> @add_int16_t(<8 x i16> %src1, <8 x i16> %src2) {
16; CHECK-LABEL: add_int16_t:
17; CHECK:       @ %bb.0: @ %entry
18; CHECK-NEXT:    vadd.i16 q0, q0, q1
19; CHECK-NEXT:    bx lr
20entry:
21  %0 = add <8 x i16> %src1, %src2
22  ret <8 x i16> %0
23}
24
; i32 lane-wise add: <4 x i32> add selects a single vadd.i32. The nsw flag
; on the IR add does not change the selected instruction (same vadd pattern
; as the flag-free i8/i16 cases above).
25define arm_aapcs_vfpcc <4 x i32> @add_int32_t(<4 x i32> %src1, <4 x i32> %src2) {
26; CHECK-LABEL: add_int32_t:
27; CHECK:       @ %bb.0: @ %entry
28; CHECK-NEXT:    vadd.i32 q0, q0, q1
29; CHECK-NEXT:    bx lr
30entry:
31  %0 = add nsw <4 x i32> %src1, %src2
32  ret <4 x i32> %0
33}
34
; i64 add: expected to be scalarized — each 64-bit lane is moved out to GPR
; pairs (vmov from d registers), added with 32-bit adds/adc carry chains,
; and the results reinserted lane-wise into q0 via the vmov q0[n] forms.
35define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
36; CHECK-LABEL: add_int64_t:
37; CHECK:       @ %bb.0: @ %entry
38; CHECK-NEXT:    .save {r4, r5, r7, lr}
39; CHECK-NEXT:    push {r4, r5, r7, lr}
40; CHECK-NEXT:    vmov lr, r12, d3
41; CHECK-NEXT:    vmov r2, r3, d1
42; CHECK-NEXT:    vmov r1, r0, d2
43; CHECK-NEXT:    vmov r4, r5, d0
44; CHECK-NEXT:    adds.w r2, r2, lr
45; CHECK-NEXT:    adc.w r3, r3, r12
46; CHECK-NEXT:    adds r1, r1, r4
47; CHECK-NEXT:    adcs r0, r5
48; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
49; CHECK-NEXT:    vmov q0[3], q0[1], r0, r3
50; CHECK-NEXT:    pop {r4, r5, r7, pc}
51entry:
52  %0 = add nsw <2 x i64> %src1, %src2
53  ret <2 x i64> %0
54}
55
; f32 add — the two RUN configurations diverge here:
;  - CHECK-MVE (+mve,+fullfp16, no vector FP): scalarized into four scalar
;    vadd.f32 on s registers, assembled in q2 then copied to q0.
;  - CHECK-MVEFP (+mve.fp): a single vector vadd.f32. Operand order q1, q0
;    mirrors the IR's %src2, %src1. Fast-math flags (nnan ninf nsz) present.
56define arm_aapcs_vfpcc <4 x float> @add_float32_t(<4 x float> %src1, <4 x float> %src2) {
57; CHECK-MVE-LABEL: add_float32_t:
58; CHECK-MVE:       @ %bb.0: @ %entry
59; CHECK-MVE-NEXT:    vadd.f32 s11, s7, s3
60; CHECK-MVE-NEXT:    vadd.f32 s10, s6, s2
61; CHECK-MVE-NEXT:    vadd.f32 s9, s5, s1
62; CHECK-MVE-NEXT:    vadd.f32 s8, s4, s0
63; CHECK-MVE-NEXT:    vmov q0, q2
64; CHECK-MVE-NEXT:    bx lr
65;
66; CHECK-MVEFP-LABEL: add_float32_t:
67; CHECK-MVEFP:       @ %bb.0: @ %entry
68; CHECK-MVEFP-NEXT:    vadd.f32 q0, q1, q0
69; CHECK-MVEFP-NEXT:    bx lr
70entry:
71  %0 = fadd nnan ninf nsz <4 x float> %src2, %src1
72  ret <4 x float> %0
73}
74
; f16 add — scalar path vs. vector path:
;  - CHECK-MVE: scalarized per half-word. Even lanes use scalar vadd.f16
;    directly on s registers; the odd (top) half of each s register is
;    extracted with vmovx.f16, added, and reinserted with vins.f16.
;  - CHECK-MVEFP: a single vector vadd.f16 q0, q1, q0 (operands reflect the
;    IR order %src2, %src1).
75define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
76; CHECK-MVE-LABEL: add_float16_t:
77; CHECK-MVE:       @ %bb.0: @ %entry
78; CHECK-MVE-NEXT:    vmov q2, q0
79; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
80; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
81; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
82; CHECK-MVE-NEXT:    vadd.f16 s12, s2, s0
83; CHECK-MVE-NEXT:    vadd.f16 s0, s4, s8
84; CHECK-MVE-NEXT:    vins.f16 s0, s12
85; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
86; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
87; CHECK-MVE-NEXT:    vadd.f16 s1, s5, s9
88; CHECK-MVE-NEXT:    vins.f16 s1, s12
89; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
90; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
91; CHECK-MVE-NEXT:    vadd.f16 s2, s6, s10
92; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
93; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
94; CHECK-MVE-NEXT:    vins.f16 s2, s12
95; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
96; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
97; CHECK-MVE-NEXT:    vadd.f16 s3, s7, s11
98; CHECK-MVE-NEXT:    vins.f16 s3, s12
99; CHECK-MVE-NEXT:    bx lr
100;
101; CHECK-MVEFP-LABEL: add_float16_t:
102; CHECK-MVEFP:       @ %bb.0: @ %entry
103; CHECK-MVEFP-NEXT:    vadd.f16 q0, q1, q0
104; CHECK-MVEFP-NEXT:    bx lr
105entry:
106  %0 = fadd nnan ninf nsz <8 x half> %src2, %src1
107  ret <8 x half> %0
108}
109
; f64 add: no hardware path in either configuration — each lane is lowered
; to a libcall to __aeabi_dadd. The arguments are spilled into callee-saved
; q4/q5 (vpush'd around the calls) and marshalled through r0-r3 per the
; AEABI soft-float calling convention; results are rebuilt into q0 via q4.
110define arm_aapcs_vfpcc <2 x double> @add_float64_t(<2 x double> %src1, <2 x double> %src2) {
111; CHECK-LABEL: add_float64_t:
112; CHECK:       @ %bb.0: @ %entry
113; CHECK-NEXT:    .save {r7, lr}
114; CHECK-NEXT:    push {r7, lr}
115; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
116; CHECK-NEXT:    vpush {d8, d9, d10, d11}
117; CHECK-NEXT:    vmov q4, q1
118; CHECK-NEXT:    vmov q5, q0
119; CHECK-NEXT:    vmov r0, r1, d9
120; CHECK-NEXT:    vmov r2, r3, d11
121; CHECK-NEXT:    bl __aeabi_dadd
122; CHECK-NEXT:    vmov lr, r12, d8
123; CHECK-NEXT:    vmov r2, r3, d10
124; CHECK-NEXT:    vmov d9, r0, r1
125; CHECK-NEXT:    mov r0, lr
126; CHECK-NEXT:    mov r1, r12
127; CHECK-NEXT:    bl __aeabi_dadd
128; CHECK-NEXT:    vmov d8, r0, r1
129; CHECK-NEXT:    vmov q0, q4
130; CHECK-NEXT:    vpop {d8, d9, d10, d11}
131; CHECK-NEXT:    pop {r7, pc}
132entry:
133  %0 = fadd nnan ninf nsz <2 x double> %src2, %src1
134  ret <2 x double> %0
135}
136
137
; i8 lane-wise sub: single vsub.i8. Operand order q1, q0 mirrors the
; (non-commutative) IR subtraction %src2 - %src1.
138define arm_aapcs_vfpcc <16 x i8> @sub_int8_t(<16 x i8> %src1, <16 x i8> %src2) {
139; CHECK-LABEL: sub_int8_t:
140; CHECK:       @ %bb.0: @ %entry
141; CHECK-NEXT:    vsub.i8 q0, q1, q0
142; CHECK-NEXT:    bx lr
143entry:
144  %0 = sub <16 x i8> %src2, %src1
145  ret <16 x i8> %0
146}
147
; i16 lane-wise sub: single vsub.i16, operands in %src2 - %src1 order.
148define arm_aapcs_vfpcc <8 x i16> @sub_int16_t(<8 x i16> %src1, <8 x i16> %src2) {
149; CHECK-LABEL: sub_int16_t:
150; CHECK:       @ %bb.0: @ %entry
151; CHECK-NEXT:    vsub.i16 q0, q1, q0
152; CHECK-NEXT:    bx lr
153entry:
154  %0 = sub <8 x i16> %src2, %src1
155  ret <8 x i16> %0
156}
157
; i32 lane-wise sub: single vsub.i32; the nsw flag does not affect selection.
158define arm_aapcs_vfpcc <4 x i32> @sub_int32_t(<4 x i32> %src1, <4 x i32> %src2) {
159; CHECK-LABEL: sub_int32_t:
160; CHECK:       @ %bb.0: @ %entry
161; CHECK-NEXT:    vsub.i32 q0, q1, q0
162; CHECK-NEXT:    bx lr
163entry:
164  %0 = sub nsw <4 x i32> %src2, %src1
165  ret <4 x i32> %0
166}
167
; i64 sub: scalarized like add_int64_t — lanes moved to GPR pairs, subtracted
; with subs/sbc borrow chains, and reinserted into q0 lane-wise.
168define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
169; CHECK-LABEL: sub_int64_t:
170; CHECK:       @ %bb.0: @ %entry
171; CHECK-NEXT:    .save {r4, r5, r7, lr}
172; CHECK-NEXT:    push {r4, r5, r7, lr}
173; CHECK-NEXT:    vmov lr, r12, d1
174; CHECK-NEXT:    vmov r2, r3, d3
175; CHECK-NEXT:    vmov r1, r0, d0
176; CHECK-NEXT:    vmov r4, r5, d2
177; CHECK-NEXT:    subs.w r2, r2, lr
178; CHECK-NEXT:    sbc.w r3, r3, r12
179; CHECK-NEXT:    subs r1, r4, r1
180; CHECK-NEXT:    sbc.w r0, r5, r0
181; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
182; CHECK-NEXT:    vmov q0[3], q0[1], r0, r3
183; CHECK-NEXT:    pop {r4, r5, r7, pc}
184entry:
185  %0 = sub nsw <2 x i64> %src2, %src1
186  ret <2 x i64> %0
187}
188
; f32 sub: scalarized into four vsub.f32 without mve.fp (CHECK-MVE), a single
; vector vsub.f32 q0, q1, q0 with mve.fp (CHECK-MVEFP). Operand order matches
; the IR's %src2 - %src1; fast-math flags (nnan ninf nsz) present.
189define arm_aapcs_vfpcc <4 x float> @sub_float32_t(<4 x float> %src1, <4 x float> %src2) {
190; CHECK-MVE-LABEL: sub_float32_t:
191; CHECK-MVE:       @ %bb.0: @ %entry
192; CHECK-MVE-NEXT:    vsub.f32 s11, s7, s3
193; CHECK-MVE-NEXT:    vsub.f32 s10, s6, s2
194; CHECK-MVE-NEXT:    vsub.f32 s9, s5, s1
195; CHECK-MVE-NEXT:    vsub.f32 s8, s4, s0
196; CHECK-MVE-NEXT:    vmov q0, q2
197; CHECK-MVE-NEXT:    bx lr
198;
199; CHECK-MVEFP-LABEL: sub_float32_t:
200; CHECK-MVEFP:       @ %bb.0: @ %entry
201; CHECK-MVEFP-NEXT:    vsub.f32 q0, q1, q0
202; CHECK-MVEFP-NEXT:    bx lr
203entry:
204  %0 = fsub nnan ninf nsz <4 x float> %src2, %src1
205  ret <4 x float> %0
206}
207
; f16 sub: same shape as add_float16_t — CHECK-MVE scalarizes per half-word
; (vmovx.f16 extracts the odd halves, scalar vsub.f16, vins.f16 reinserts),
; CHECK-MVEFP selects one vector vsub.f16 q0, q1, q0.
208define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
209; CHECK-MVE-LABEL: sub_float16_t:
210; CHECK-MVE:       @ %bb.0: @ %entry
211; CHECK-MVE-NEXT:    vmov q2, q0
212; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
213; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
214; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
215; CHECK-MVE-NEXT:    vsub.f16 s12, s2, s0
216; CHECK-MVE-NEXT:    vsub.f16 s0, s4, s8
217; CHECK-MVE-NEXT:    vins.f16 s0, s12
218; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
219; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
220; CHECK-MVE-NEXT:    vsub.f16 s1, s5, s9
221; CHECK-MVE-NEXT:    vins.f16 s1, s12
222; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
223; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
224; CHECK-MVE-NEXT:    vsub.f16 s2, s6, s10
225; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
226; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
227; CHECK-MVE-NEXT:    vins.f16 s2, s12
228; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
229; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
230; CHECK-MVE-NEXT:    vsub.f16 s3, s7, s11
231; CHECK-MVE-NEXT:    vins.f16 s3, s12
232; CHECK-MVE-NEXT:    bx lr
233;
234; CHECK-MVEFP-LABEL: sub_float16_t:
235; CHECK-MVEFP:       @ %bb.0: @ %entry
236; CHECK-MVEFP-NEXT:    vsub.f16 q0, q1, q0
237; CHECK-MVEFP-NEXT:    bx lr
238entry:
239  %0 = fsub nnan ninf nsz <8 x half> %src2, %src1
240  ret <8 x half> %0
241}
242
; f64 sub: per-lane libcall to __aeabi_dsub, identical call/marshalling
; structure to add_float64_t (q4/q5 saved via vpush, args in r0-r3).
243define arm_aapcs_vfpcc <2 x double> @sub_float64_t(<2 x double> %src1, <2 x double> %src2) {
244; CHECK-LABEL: sub_float64_t:
245; CHECK:       @ %bb.0: @ %entry
246; CHECK-NEXT:    .save {r7, lr}
247; CHECK-NEXT:    push {r7, lr}
248; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
249; CHECK-NEXT:    vpush {d8, d9, d10, d11}
250; CHECK-NEXT:    vmov q4, q1
251; CHECK-NEXT:    vmov q5, q0
252; CHECK-NEXT:    vmov r0, r1, d9
253; CHECK-NEXT:    vmov r2, r3, d11
254; CHECK-NEXT:    bl __aeabi_dsub
255; CHECK-NEXT:    vmov lr, r12, d8
256; CHECK-NEXT:    vmov r2, r3, d10
257; CHECK-NEXT:    vmov d9, r0, r1
258; CHECK-NEXT:    mov r0, lr
259; CHECK-NEXT:    mov r1, r12
260; CHECK-NEXT:    bl __aeabi_dsub
261; CHECK-NEXT:    vmov d8, r0, r1
262; CHECK-NEXT:    vmov q0, q4
263; CHECK-NEXT:    vpop {d8, d9, d10, d11}
264; CHECK-NEXT:    pop {r7, pc}
265entry:
266  %0 = fsub nnan ninf nsz <2 x double> %src2, %src1
267  ret <2 x double> %0
268}
269
270
; i8 lane-wise multiply: single vmul.i8 on q registers.
271define arm_aapcs_vfpcc <16 x i8> @mul_int8_t(<16 x i8> %src1, <16 x i8> %src2) {
272; CHECK-LABEL: mul_int8_t:
273; CHECK:       @ %bb.0: @ %entry
274; CHECK-NEXT:    vmul.i8 q0, q0, q1
275; CHECK-NEXT:    bx lr
276entry:
277  %0 = mul <16 x i8> %src1, %src2
278  ret <16 x i8> %0
279}
280
; i16 lane-wise multiply: single vmul.i16 on q registers.
281define arm_aapcs_vfpcc <8 x i16> @mul_int16_t(<8 x i16> %src1, <8 x i16> %src2) {
282; CHECK-LABEL: mul_int16_t:
283; CHECK:       @ %bb.0: @ %entry
284; CHECK-NEXT:    vmul.i16 q0, q0, q1
285; CHECK-NEXT:    bx lr
286entry:
287  %0 = mul <8 x i16> %src1, %src2
288  ret <8 x i16> %0
289}
290
; i32 lane-wise multiply: single vmul.i32; nsw does not affect selection.
291define arm_aapcs_vfpcc <4 x i32> @mul_int32_t(<4 x i32> %src1, <4 x i32> %src2) {
292; CHECK-LABEL: mul_int32_t:
293; CHECK:       @ %bb.0: @ %entry
294; CHECK-NEXT:    vmul.i32 q0, q0, q1
295; CHECK-NEXT:    bx lr
296entry:
297  %0 = mul nsw <4 x i32> %src1, %src2
298  ret <4 x i32> %0
299}
300
; i64 multiply: scalarized — each lane computed as a 64x64->64 product using
; umull for the low 32x32 partial product plus two mla instructions for the
; cross terms, then reassembled into q0 lane-wise.
301define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
302; CHECK-LABEL: mul_int64_t:
303; CHECK:       @ %bb.0: @ %entry
304; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
305; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
306; CHECK-NEXT:    vmov r0, r1, d2
307; CHECK-NEXT:    vmov r2, lr, d0
308; CHECK-NEXT:    vmov r4, r5, d3
309; CHECK-NEXT:    umull r12, r3, r2, r0
310; CHECK-NEXT:    mla r1, r2, r1, r3
311; CHECK-NEXT:    vmov r2, r3, d1
312; CHECK-NEXT:    mla r0, lr, r0, r1
313; CHECK-NEXT:    umull r6, r7, r2, r4
314; CHECK-NEXT:    mla r2, r2, r5, r7
315; CHECK-NEXT:    vmov q0[2], q0[0], r12, r6
316; CHECK-NEXT:    mla r2, r3, r4, r2
317; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
318; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
319entry:
320  %0 = mul nsw <2 x i64> %src1, %src2
321  ret <2 x i64> %0
322}
323
; f16 multiply: same shape as add_float16_t — CHECK-MVE scalarizes per
; half-word (vmovx.f16 extract, scalar vmul.f16, vins.f16 reinsert),
; CHECK-MVEFP selects one vector vmul.f16 q0, q1, q0.
324define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
325; CHECK-MVE-LABEL: mul_float16_t:
326; CHECK-MVE:       @ %bb.0: @ %entry
327; CHECK-MVE-NEXT:    vmov q2, q0
328; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
329; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
330; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
331; CHECK-MVE-NEXT:    vmul.f16 s12, s2, s0
332; CHECK-MVE-NEXT:    vmul.f16 s0, s4, s8
333; CHECK-MVE-NEXT:    vins.f16 s0, s12
334; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
335; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
336; CHECK-MVE-NEXT:    vmul.f16 s1, s5, s9
337; CHECK-MVE-NEXT:    vins.f16 s1, s12
338; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
339; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
340; CHECK-MVE-NEXT:    vmul.f16 s2, s6, s10
341; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
342; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
343; CHECK-MVE-NEXT:    vins.f16 s2, s12
344; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
345; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
346; CHECK-MVE-NEXT:    vmul.f16 s3, s7, s11
347; CHECK-MVE-NEXT:    vins.f16 s3, s12
348; CHECK-MVE-NEXT:    bx lr
349;
350; CHECK-MVEFP-LABEL: mul_float16_t:
351; CHECK-MVEFP:       @ %bb.0: @ %entry
352; CHECK-MVEFP-NEXT:    vmul.f16 q0, q1, q0
353; CHECK-MVEFP-NEXT:    bx lr
354entry:
355  %0 = fmul nnan ninf nsz <8 x half> %src2, %src1
356  ret <8 x half> %0
357}
358
; f32 multiply: four scalar vmul.f32 without mve.fp (CHECK-MVE), one vector
; vmul.f32 q0, q1, q0 with mve.fp (CHECK-MVEFP). Fast-math flags present.
359define arm_aapcs_vfpcc <4 x float> @mul_float32_t(<4 x float> %src1, <4 x float> %src2) {
360; CHECK-MVE-LABEL: mul_float32_t:
361; CHECK-MVE:       @ %bb.0: @ %entry
362; CHECK-MVE-NEXT:    vmul.f32 s11, s7, s3
363; CHECK-MVE-NEXT:    vmul.f32 s10, s6, s2
364; CHECK-MVE-NEXT:    vmul.f32 s9, s5, s1
365; CHECK-MVE-NEXT:    vmul.f32 s8, s4, s0
366; CHECK-MVE-NEXT:    vmov q0, q2
367; CHECK-MVE-NEXT:    bx lr
368;
369; CHECK-MVEFP-LABEL: mul_float32_t:
370; CHECK-MVEFP:       @ %bb.0: @ %entry
371; CHECK-MVEFP-NEXT:    vmul.f32 q0, q1, q0
372; CHECK-MVEFP-NEXT:    bx lr
373entry:
374  %0 = fmul nnan ninf nsz <4 x float> %src2, %src1
375  ret <4 x float> %0
376}
377
; f64 multiply: per-lane libcall to __aeabi_dmul, same call/marshalling
; structure as the f64 add/sub cases above.
378define arm_aapcs_vfpcc <2 x double> @mul_float64_t(<2 x double> %src1, <2 x double> %src2) {
379; CHECK-LABEL: mul_float64_t:
380; CHECK:       @ %bb.0: @ %entry
381; CHECK-NEXT:    .save {r7, lr}
382; CHECK-NEXT:    push {r7, lr}
383; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
384; CHECK-NEXT:    vpush {d8, d9, d10, d11}
385; CHECK-NEXT:    vmov q4, q1
386; CHECK-NEXT:    vmov q5, q0
387; CHECK-NEXT:    vmov r0, r1, d9
388; CHECK-NEXT:    vmov r2, r3, d11
389; CHECK-NEXT:    bl __aeabi_dmul
390; CHECK-NEXT:    vmov lr, r12, d8
391; CHECK-NEXT:    vmov r2, r3, d10
392; CHECK-NEXT:    vmov d9, r0, r1
393; CHECK-NEXT:    mov r0, lr
394; CHECK-NEXT:    mov r1, r12
395; CHECK-NEXT:    bl __aeabi_dmul
396; CHECK-NEXT:    vmov d8, r0, r1
397; CHECK-NEXT:    vmov q0, q4
398; CHECK-NEXT:    vpop {d8, d9, d10, d11}
399; CHECK-NEXT:    pop {r7, pc}
400entry:
401  %0 = fmul nnan ninf nsz <2 x double> %src2, %src1
402  ret <2 x double> %0
403}
404
405