; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; i32

; Deinterleave a <6 x i32> load into 3 stride-3 vectors, sum them, store <2 x i32>.
define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld3_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
; CHECK-NEXT:    vmov.f64 d2, d0
; CHECK-NEXT:    vmov.f32 s6, s3
; CHECK-NEXT:    vmov r12, lr, d0
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add.w r3, r12, lr
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    strd r2, r0, [r1]
; CHECK-NEXT:    pop {r7, pc}
entry:
  %l1 = load <6 x i32>, <6 x i32>* %src, align 4
  %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i32> %s1, %s2
  %a = add <2 x i32> %a1, %s3
  store <2 x i32> %a, <2 x i32> *%dst
  ret void
}
; Deinterleave a <12 x i32> load into 3 stride-3 vectors, sum them, store <4 x i32>.
define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld3_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.i32 q0, q2, q0
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <12 x i32>, <12 x i32>* %src, align 4
  %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i32> %s1, %s2
  %a = add <4 x i32> %a1, %s3
  store <4 x i32> %a, <4 x i32> *%dst
  ret void
}
; Deinterleave a <24 x i32> load into 3 stride-3 vectors, sum them, store <8 x i32>.
define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vld3_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.i32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f32 s16, s9
; CHECK-NEXT:    vmov.f64 d10, d4
; CHECK-NEXT:    vmov.f32 s17, s4
; CHECK-NEXT:    vmov.f32 s21, s11
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s19, s14
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s7, s15
; CHECK-NEXT:    vadd.i32 q1, q4, q1
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <24 x i32>, <24 x i32>* %src, align 4
  %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i32> %s1, %s2
  %a = add <8 x i32> %a1, %s3
  store <8 x i32> %a, <8 x i32> *%dst
  ret void
}
; Deinterleave a <48 x i32> load into 3 stride-3 vectors, sum them, store <16 x i32>.
define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vld3_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.i32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmov.f32 s16, s9
; CHECK-NEXT:    vmov.f64 d10, d4
; CHECK-NEXT:    vmov.f32 s17, s4
; CHECK-NEXT:    vmov.f32 s21, s11
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s19, s14
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vmov.f32 s7, s15
; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s20, s13
; CHECK-NEXT:    vadd.i32 q1, q4, q1
; CHECK-NEXT:    vmov.f64 d8, d6
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s21, s8
; CHECK-NEXT:    vmov.f32 s18, s10
; CHECK-NEXT:    vmov.f32 s22, s11
; CHECK-NEXT:    vmov.f32 s8, s14
; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
; CHECK-NEXT:    vmov.f32 s10, s24
; CHECK-NEXT:    vmov.f32 s23, s26
; CHECK-NEXT:    vmov.f32 s19, s25
; CHECK-NEXT:    vadd.i32 q4, q4, q5
; CHECK-NEXT:    vmov.f32 s11, s27
; CHECK-NEXT:    vadd.i32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vmov.f32 s24, s17
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f64 d14, d8
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vmov.f32 s25, s12
; CHECK-NEXT:    vmov.f32 s29, s19
; CHECK-NEXT:    vmov.f32 s26, s15
; CHECK-NEXT:    vmov.f32 s30, s14
; CHECK-NEXT:    vmov.f32 s12, s18
; CHECK-NEXT:    vmov.f32 s14, s20
; CHECK-NEXT:    vmov.f32 s27, s22
; CHECK-NEXT:    vmov.f32 s31, s21
; CHECK-NEXT:    vadd.i32 q6, q7, q6
; CHECK-NEXT:    vmov.f32 s15, s23
; CHECK-NEXT:    vadd.i32 q3, q6, q3
; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <48 x i32>, <48 x i32>* %src, align 4
  %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i32> %s1, %s2
  %a = add <16 x i32> %a1, %s3
  store <16 x i32> %a, <16 x i32> *%dst
  ret void
}

; i16

; Deinterleave a <6 x i16> load into 3 stride-3 vectors, sum them, store <2 x i16>.
define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
; CHECK-LABEL: vld3_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    ldr r2, [r0, #8]
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    str r2, [sp]
; CHECK-NEXT:    vmov.f64 d2, d0
; CHECK-NEXT:    vmov.f32 s6, s3
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vmov.f64 d6, d1
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vldrh.u32 q1, [r3]
; CHECK-NEXT:    vmov.f32 s10, s4
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1, #2]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <6 x i16>, <6 x i16>* %src, align 4
  %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i16> %s1, %s2
  %a = add <2 x i16> %a1, %s3
  store <2 x i16> %a, <2 x i16> *%dst
  ret void
}
; Deinterleave a <12 x i16> load into 3 stride-3 vectors, sum them, store <4 x i16>.
define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vld3_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.u16 r5, q0[6]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vmov r0, r3, d2
; CHECK-NEXT:    vmov.u16 lr, q0[2]
; CHECK-NEXT:    vmov r2, r4, d3
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r5
; CHECK-NEXT:    vmov.u16 r5, q0[7]
; CHECK-NEXT:    vmov.u16 r6, q0[1]
; CHECK-NEXT:    vmov q2[2], q2[0], r6, r5
; CHECK-NEXT:    vmov.u16 r5, q0[3]
; CHECK-NEXT:    vmov.u16 r6, q0[4]
; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
; CHECK-NEXT:    vmov q2[3], q2[1], r6, r2
; CHECK-NEXT:    vmov.u16 r12, q0[5]
; CHECK-NEXT:    vadd.i32 q0, q1, q2
; CHECK-NEXT:    vmov q1[2], q1[0], lr, r0
; CHECK-NEXT:    vmov q1[3], q1[1], r12, r4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %l1 = load <12 x i16>, <12 x i16>* %src, align 4
  %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i16> %s1, %s2
  %a = add <4 x i16> %a1, %s3
  store <4 x i16> %a, <4 x i16> *%dst
  ret void
}
; Deinterleave a <24 x i16> load into 3 stride-3 vectors, sum them, store <8 x i16>.
define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld3_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmovx.f16 s8, s6
; CHECK-NEXT:    vmov.f32 s0, s5
; CHECK-NEXT:    vins.f16 s0, s8
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmovx.f16 s12, s9
; CHECK-NEXT:    vmov.f32 s1, s8
; CHECK-NEXT:    vins.f16 s1, s12
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.u16 r0, q2[5]
; CHECK-NEXT:    vmovx.f16 s20, s15
; CHECK-NEXT:    vmov.f32 s19, s14
; CHECK-NEXT:    vins.f16 s19, s20
; CHECK-NEXT:    vmov.f32 s18, s12
; CHECK-NEXT:    vmov q5, q4
; CHECK-NEXT:    vmovnb.i32 q5, q0
; CHECK-NEXT:    vmov.f32 s2, s22
; CHECK-NEXT:    vmovx.f16 s20, s5
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vmov.f64 d8, d2
; CHECK-NEXT:    vins.f16 s16, s20
; CHECK-NEXT:    vmovx.f16 s20, s8
; CHECK-NEXT:    vmov.f32 s17, s7
; CHECK-NEXT:    vins.f16 s17, s20
; CHECK-NEXT:    vmovx.f16 s20, s11
; CHECK-NEXT:    vmov.f32 s18, s10
; CHECK-NEXT:    vins.f16 s18, s20
; CHECK-NEXT:    vmovx.f16 s20, s14
; CHECK-NEXT:    vmov.f32 s19, s13
; CHECK-NEXT:    vins.f16 s19, s20
; CHECK-NEXT:    vmovx.f16 s20, s4
; CHECK-NEXT:    vins.f16 s20, s6
; CHECK-NEXT:    vmovx.f16 s21, s7
; CHECK-NEXT:    vins.f16 s6, s12
; CHECK-NEXT:    vmovx.f16 s7, s13
; CHECK-NEXT:    vins.f16 s21, s9
; CHECK-NEXT:    vins.f16 s7, s15
; CHECK-NEXT:    vmov.16 q5[4], r0
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vmovnb.i32 q2, q5
; CHECK-NEXT:    vmov.f32 s22, s10
; CHECK-NEXT:    vmov.f32 s23, s7
; CHECK-NEXT:    vadd.i16 q1, q4, q5
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <24 x i16>, <24 x i16>* %src, align 4
  %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i16> %s1, %s2
  %a = add <8 x i16> %a1, %s3
  store <8 x i16> %a, <8 x i16> *%dst
  ret void
}
; Deinterleave a <48 x i16> load into 3 stride-3 vectors, sum them, store <16 x i16>.
define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vld3_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vmov.f64 d0, d2
; CHECK-NEXT:    vmovx.f16 s8, s5
; CHECK-NEXT:    vins.f16 s0, s8
; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
; CHECK-NEXT:    vmov.f32 s1, s7
; CHECK-NEXT:    vmovx.f16 s12, s8
; CHECK-NEXT:    vmovx.f16 s16, s9
; CHECK-NEXT:    vins.f16 s1, s12
; CHECK-NEXT:    vmovx.f16 s12, s11
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vmov.u16 r2, q2[5]
; CHECK-NEXT:    vins.f16 s2, s12
; CHECK-NEXT:    vmovx.f16 s12, s6
; CHECK-NEXT:    vins.f16 s5, s12
; CHECK-NEXT:    vmov.f32 s13, s8
; CHECK-NEXT:    vins.f16 s13, s16
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmovx.f16 s20, s18
; CHECK-NEXT:    vmov.f32 s3, s17
; CHECK-NEXT:    vins.f16 s3, s20
; CHECK-NEXT:    vmovx.f16 s20, s19
; CHECK-NEXT:    vins.f16 s18, s20
; CHECK-NEXT:    vmov.f32 s14, s11
; CHECK-NEXT:    vmov.f32 s23, s18
; CHECK-NEXT:    vmov.f32 s22, s16
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vmovnb.i32 q6, q3
; CHECK-NEXT:    vmov.f32 s14, s26
; CHECK-NEXT:    vmov.f32 s15, s23
; CHECK-NEXT:    vmovx.f16 s20, s4
; CHECK-NEXT:    vins.f16 s20, s6
; CHECK-NEXT:    vmovx.f16 s21, s7
; CHECK-NEXT:    vins.f16 s6, s16
; CHECK-NEXT:    vmovx.f16 s7, s17
; CHECK-NEXT:    vins.f16 s21, s9
; CHECK-NEXT:    vins.f16 s7, s19
; CHECK-NEXT:    vmov.16 q5[4], r2
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vmovnb.i32 q2, q5
; CHECK-NEXT:    vmov.f32 s22, s10
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmov.f32 s23, s7
; CHECK-NEXT:    vadd.i16 q0, q0, q5
; CHECK-NEXT:    vmov.f32 s4, s9
; CHECK-NEXT:    vadd.i16 q0, q0, q3
; CHECK-NEXT:    vmovx.f16 s12, s10
; CHECK-NEXT:    vins.f16 s4, s12
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmovx.f16 s0, s9
; CHECK-NEXT:    vmovx.f16 s16, s13
; CHECK-NEXT:    vmov.f32 s5, s12
; CHECK-NEXT:    vins.f16 s5, s16
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vmov.f32 s6, s15
; CHECK-NEXT:    vmov.u16 r0, q3[5]
; CHECK-NEXT:    vmovx.f16 s20, s19
; CHECK-NEXT:    vmov.f32 s27, s18
; CHECK-NEXT:    vins.f16 s27, s20
; CHECK-NEXT:    vmov.f64 d10, d4
; CHECK-NEXT:    vins.f16 s20, s0
; CHECK-NEXT:    vmov.f32 s26, s16
; CHECK-NEXT:    vmovx.f16 s0, s12
; CHECK-NEXT:    vmov.f32 s21, s11
; CHECK-NEXT:    vins.f16 s21, s0
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vmovnb.i32 q7, q1
; CHECK-NEXT:    vmovx.f16 s0, s15
; CHECK-NEXT:    vmov.f32 s22, s14
; CHECK-NEXT:    vins.f16 s22, s0
; CHECK-NEXT:    vmov.f32 s6, s30
; CHECK-NEXT:    vmov.f32 s7, s27
; CHECK-NEXT:    vmovx.f16 s24, s8
; CHECK-NEXT:    vmovx.f16 s0, s18
; CHECK-NEXT:    vmov.f32 s23, s17
; CHECK-NEXT:    vins.f16 s24, s10
; CHECK-NEXT:    vins.f16 s23, s0
; CHECK-NEXT:    vins.f16 s2, s16
; CHECK-NEXT:    vmovx.f16 s25, s11
; CHECK-NEXT:    vmovx.f16 s3, s17
; CHECK-NEXT:    vins.f16 s25, s13
; CHECK-NEXT:    vins.f16 s3, s19
; CHECK-NEXT:    vmov.16 q6[4], r0
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmovnb.i32 q2, q6
; CHECK-NEXT:    vmov.f32 s26, s10
; CHECK-NEXT:    vmov.f32 s27, s3
; CHECK-NEXT:    vadd.i16 q0, q5, q6
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <48 x i16>, <48 x i16>* %src, align 4
  %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i16> %s1, %s2
  %a = add <16 x i16> %a1, %s3
  store <16 x i16> %a, <16 x i16> *%dst
  ret void
}

; i8

; Deinterleave a <6 x i8> load into 3 stride-3 vectors, sum them, store <2 x i8>.
define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) {
; CHECK-LABEL: vld3_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldrd r2, r0, [r0]
; CHECK-NEXT:    strd r2, r0, [sp]
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[4]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1, #1]
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <6 x i8>, <6 x i8>* %src, align 4
  %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i8> %s1, %s2
  %a = add <2 x i8> %a1, %s3
  store <2 x i8> %a, <2 x i8> *%dst
  ret void
}
; Deinterleave a <12 x i8> load into 3 stride-3 vectors, sum them, store <4 x i8>.
define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vld3_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    ldr r0, [r0, #8]
; CHECK-NEXT:    str r0, [sp]
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov.u16 r4, q0[0]
; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
; CHECK-NEXT:    vmov.u16 r3, q0[7]
; CHECK-NEXT:    vmov.u16 r4, q0[1]
; CHECK-NEXT:    vmov.u16 r12, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    vmov.u16 lr, q0[2]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    vmov.u16 r0, q0[4]
; CHECK-NEXT:    vldrb.u16 q0, [r3]
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vmov q2[2], q2[0], lr, r0
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov q2[3], q2[1], r12, r0
; CHECK-NEXT:    vadd.i32 q0, q1, q2
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop {r4, pc}
entry:
  %l1 = load <12 x i8>, <12 x i8>* %src, align 4
  %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i8> %s1, %s2
  %a = add <4 x i8> %a1, %s3
  store <4 x i8> %a, <4 x i8> *%dst
  ret void
}
; Deinterleave a <24 x i8> load into 3 stride-3 vectors, sum them, store <8 x i8>.
define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld3_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q0[4]
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmov.u8 r2, q0[7]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.16 q2[2], r2
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.16 q2[4], r2
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vins.f16 s10, s4
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmovx.f16 s16, s6
; CHECK-NEXT:    vmov.f32 s18, s5
; CHECK-NEXT:    vmovx.f16 s11, s5
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vins.f16 s18, s16
; CHECK-NEXT:    vins.f16 s11, s7
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmovx.f16 s0, s7
; CHECK-NEXT:    vmov.f32 s2, s6
; CHECK-NEXT:    vins.f16 s2, s0
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.f32 s15, s2
; CHECK-NEXT:    vadd.i16 q0, q2, q3
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <24 x i8>, <24 x i8>* %src, align 4
  %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i8> %s1, %s2
  %a = add <8 x i8> %a1, %s3
  store <8 x i8> %a, <8 x i8> *%dst
  ret void
}
; Deinterleave a <48 x i8> load into 3 stride-3 vectors, sum them, store <16 x i8>.
define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vld3_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov.u8 r2, q1[1]
; CHECK-NEXT:    vmov.8 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[4]
; CHECK-NEXT:    vmov.8 q3[1], r2
; CHECK-NEXT:    vmov.u8 r2, q1[7]
; CHECK-NEXT:    vmov.8 q3[2], r2
; CHECK-NEXT:    vmov.u8 r2, q1[10]
; CHECK-NEXT:    vmov.8 q3[3], r2
; CHECK-NEXT:    vmov.u8 r2, q1[13]
; CHECK-NEXT:    vmov.8 q3[4], r2
; CHECK-NEXT:    vmov.u8 r2, q0[0]
; CHECK-NEXT:    vmov.8 q3[5], r2
; CHECK-NEXT:    vmov.u8 r2, q0[3]
; CHECK-NEXT:    vmov.8 q3[6], r2
; CHECK-NEXT:    vmov.u8 r2, q0[6]
; CHECK-NEXT:    vmov.8 q3[7], r2
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    vmov.u8 r0, q2[5]
; CHECK-NEXT:    vmov.8 q3[8], r2
; CHECK-NEXT:    vmov.u8 r2, q0[12]
; CHECK-NEXT:    vmov.8 q4[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[8]
; CHECK-NEXT:    vmov.8 q3[9], r2
; CHECK-NEXT:    vmov.u8 r2, q0[15]
; CHECK-NEXT:    vmov.8 q4[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[11]
; CHECK-NEXT:    vmov.8 q3[10], r2
; CHECK-NEXT:    vmov.8 q4[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[14]
; CHECK-NEXT:    vmov.8 q4[15], r0
; CHECK-NEXT:    vmov.u8 r0, q2[2]
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vmov.8 q5[11], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.f32 s14, s22
; CHECK-NEXT:    vmov.f32 s15, s19
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q4[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q4[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q4[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[4]
; CHECK-NEXT:    vmov.8 q5[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[7]
; CHECK-NEXT:    vmov.8 q5[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[10]
; CHECK-NEXT:    vmov.8 q5[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[13]
; CHECK-NEXT:    vmov.8 q5[15], r0
; CHECK-NEXT:    vmov.u8 r0, q2[1]
; CHECK-NEXT:    vmov q6, q4
; CHECK-NEXT:    vmov.8 q6[11], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.f32 s18, s26
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vadd.i8 q3, q4, q3
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q2[6]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[9]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[12]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q5[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q5[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[0]
; CHECK-NEXT:    vmov.8 q5[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[3]
; CHECK-NEXT:    vmov.8 q5[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s19, s7
; CHECK-NEXT:    vadd.i8 q0, q3, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <48 x i8>, <48 x i8>* %src, align 4
  %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i8> %s1, %s2
  %a = add <16 x i8> %a1, %s3
  store <16 x i8> %a, <16 x i8> *%dst
  ret void
}

; i64

; Deinterleave a <6 x i64> load into 3 stride-3 vectors, sum them, store <2 x i64>.
define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
; CHECK-LABEL: vld3_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d6, d1
; CHECK-NEXT:    vmov.f32 s13, s3
; CHECK-NEXT:    vmov.f32 s14, s4
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vmov.f32 s3, s11
; CHECK-NEXT:    vmov.f32 s15, s5
; CHECK-NEXT:    vmov.f32 s10, s6
; CHECK-NEXT:    vmov.f32 s11, s7
; CHECK-NEXT:    vmov r5, r8, d6
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    vmov r0, r3, d1
; CHECK-NEXT:    vmov lr, r12, d7
; CHECK-NEXT:    vmov r2, r4, d5
; CHECK-NEXT:    adds.w r0, r0, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r2, r3, r4
; CHECK-NEXT:    vmov r3, r4, d4
; CHECK-NEXT:    adds r6, r6, r5
; CHECK-NEXT:    adc.w r7, r7, r8
; CHECK-NEXT:    adds r3, r3, r6
; CHECK-NEXT:    adcs r7, r4
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r7, r2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %l1 = load <6 x i64>, <6 x i64>* %src, align 4
  %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i64> %s1, %s2
  %a = add <2 x i64> %a1, %s3
  store <2 x i64> %a, <2 x i64> *%dst
  ret void
}
792
; De-interleaving (vld3-style) load test: load <12 x i64>, split it into the
; three stride-3 subsequences, add them together, and store the <4 x i64> sum.
; i64 has no MVE vld3 lowering, so this is expected to expand to scalar adds.
define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vld3_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
; CHECK-NEXT:    vmov.f64 d2, d1
; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s2, s10
; CHECK-NEXT:    vmov.f32 s3, s11
; CHECK-NEXT:    vmov.f32 s10, s14
; CHECK-NEXT:    vmov.f32 s7, s13
; CHECK-NEXT:    vmov.f32 s11, s15
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vmov.f64 d10, d7
; CHECK-NEXT:    vmov lr, r12, d3
; CHECK-NEXT:    vmov r5, r4, d1
; CHECK-NEXT:    vmov r3, r8, d5
; CHECK-NEXT:    vmov.f32 s21, s15
; CHECK-NEXT:    vmov.f32 s22, s24
; CHECK-NEXT:    vmov.f32 s14, s18
; CHECK-NEXT:    vmov.f32 s23, s25
; CHECK-NEXT:    vmov.f32 s15, s19
; CHECK-NEXT:    vmov.f32 s18, s26
; CHECK-NEXT:    vmov r6, r7, d10
; CHECK-NEXT:    vmov.f32 s19, s27
; CHECK-NEXT:    adds.w r0, r5, lr
; CHECK-NEXT:    adc.w r5, r4, r12
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r4, r2, d6
; CHECK-NEXT:    adc.w r12, r5, r8
; CHECK-NEXT:    vmov r5, r0, d8
; CHECK-NEXT:    adds r6, r6, r4
; CHECK-NEXT:    adcs r2, r7
; CHECK-NEXT:    adds r6, r6, r5
; CHECK-NEXT:    adc.w r8, r2, r0
; CHECK-NEXT:    vmov r7, r4, d11
; CHECK-NEXT:    vmov r2, r5, d7
; CHECK-NEXT:    vmov r3, r0, d0
; CHECK-NEXT:    adds r2, r2, r7
; CHECK-NEXT:    adc.w r7, r5, r4
; CHECK-NEXT:    vmov r5, r4, d9
; CHECK-NEXT:    adds r2, r2, r5
; CHECK-NEXT:    adcs r7, r4
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
; CHECK-NEXT:    vmov q1[3], q1[1], r8, r7
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    adcs r0, r4
; CHECK-NEXT:    vmov r4, r5, d4
; CHECK-NEXT:    adds r3, r3, r4
; CHECK-NEXT:    vmov q0[2], q0[0], r3, lr
; CHECK-NEXT:    adcs r0, r5
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <12 x i64>, <12 x i64>* %src, align 4
  %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; factor 0: lanes 0,3,6,9
  %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; factor 1: lanes 1,4,7,10
  %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> ; factor 2: lanes 2,5,8,11
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = add <4 x i64> %s1, %s2
  %a = add <4 x i64> %a1, %s3
  store <4 x i64> %a, <4 x i64> *%dst
  ret void
}
868
869; f32
870
; De-interleaving (vld3-style) load test: load <6 x float>, split it into the
; three stride-3 subsequences, fadd them together, and store the <2 x float> sum.
define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
; CHECK-LABEL: vld3_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldr s1, [r0, #16]
; CHECK-NEXT:    vldr s5, [r0, #20]
; CHECK-NEXT:    vmov.f64 d6, d4
; CHECK-NEXT:    vmov.f32 s13, s11
; CHECK-NEXT:    vmov.f32 s0, s9
; CHECK-NEXT:    vadd.f32 q0, q3, q0
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vstmia r1, {s0, s1}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <6 x float>, <6 x float>* %src, align 4
  %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3> ; factor 0: lanes 0,3
  %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4> ; factor 1: lanes 1,4
  %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5> ; factor 2: lanes 2,5
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <2 x float> %s1, %s2
  %a = fadd <2 x float> %a1, %s3
  store <2 x float> %a, <2 x float> *%dst
  ret void
}
895
; De-interleaving (vld3-style) load test: load <12 x float>, split it into the
; three stride-3 subsequences, fadd them together, and store the <4 x float> sum.
define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld3_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.f32 q0, q2, q0
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <12 x float>, <12 x float>* %src, align 4
  %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; factor 0: lanes 0,3,6,9
  %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; factor 1: lanes 1,4,7,10
  %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> ; factor 2: lanes 2,5,8,11
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <4 x float> %s1, %s2
  %a = fadd <4 x float> %a1, %s3
  store <4 x float> %a, <4 x float> *%dst
  ret void
}
930
; De-interleaving (vld3-style) load test: load <24 x float>, split it into the
; three stride-3 subsequences, fadd them together, and store the <8 x float> sum.
; Wider than one vector, so the lowering is done in two 4-element chunks.
define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vld3_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.f32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f32 s16, s9
; CHECK-NEXT:    vmov.f64 d10, d4
; CHECK-NEXT:    vmov.f32 s17, s4
; CHECK-NEXT:    vmov.f32 s21, s11
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s19, s14
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vadd.f32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s7, s15
; CHECK-NEXT:    vadd.f32 q1, q4, q1
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <24 x float>, <24 x float>* %src, align 4
  %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> ; factor 0: lanes 0,3,...
  %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> ; factor 1: lanes 1,4,...
  %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> ; factor 2: lanes 2,5,...
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <8 x float> %s1, %s2
  %a = fadd <8 x float> %a1, %s3
  store <8 x float> %a, <8 x float> *%dst
  ret void
}
982
; De-interleaving (vld3-style) load test: load <48 x float>, split it into the
; three stride-3 subsequences, fadd them together, and store the <16 x float>
; sum. Wider than one vector, so the lowering is done in four 4-element chunks.
define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vld3_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
; CHECK-NEXT:    vmov.f64 d4, d2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s15, s18
; CHECK-NEXT:    vmov.f32 s11, s17
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s19
; CHECK-NEXT:    vadd.f32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmov.f32 s16, s9
; CHECK-NEXT:    vmov.f64 d10, d4
; CHECK-NEXT:    vmov.f32 s17, s4
; CHECK-NEXT:    vmov.f32 s21, s11
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s19, s14
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vmov.f32 s7, s15
; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
; CHECK-NEXT:    vadd.f32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s20, s13
; CHECK-NEXT:    vadd.f32 q1, q4, q1
; CHECK-NEXT:    vmov.f64 d8, d6
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s21, s8
; CHECK-NEXT:    vmov.f32 s18, s10
; CHECK-NEXT:    vmov.f32 s22, s11
; CHECK-NEXT:    vmov.f32 s8, s14
; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
; CHECK-NEXT:    vmov.f32 s10, s24
; CHECK-NEXT:    vmov.f32 s23, s26
; CHECK-NEXT:    vmov.f32 s19, s25
; CHECK-NEXT:    vadd.f32 q4, q4, q5
; CHECK-NEXT:    vmov.f32 s11, s27
; CHECK-NEXT:    vadd.f32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vmov.f32 s24, s17
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f64 d14, d8
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vmov.f32 s25, s12
; CHECK-NEXT:    vmov.f32 s29, s19
; CHECK-NEXT:    vmov.f32 s26, s15
; CHECK-NEXT:    vmov.f32 s30, s14
; CHECK-NEXT:    vmov.f32 s12, s18
; CHECK-NEXT:    vmov.f32 s14, s20
; CHECK-NEXT:    vmov.f32 s27, s22
; CHECK-NEXT:    vmov.f32 s31, s21
; CHECK-NEXT:    vadd.f32 q6, q7, q6
; CHECK-NEXT:    vmov.f32 s15, s23
; CHECK-NEXT:    vadd.f32 q3, q6, q3
; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <48 x float>, <48 x float>* %src, align 4
  %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> ; factor 0: lanes 0,3,...
  %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> ; factor 1: lanes 1,4,...
  %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> ; factor 2: lanes 2,5,...
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <16 x float> %s1, %s2
  %a = fadd <16 x float> %a1, %s3
  store <16 x float> %a, <16 x float> *%dst
  ret void
}
1068
1069; f16
1070
; De-interleaving (vld3-style) load test: load <6 x half>, split it into the
; three stride-3 subsequences, fadd them together, and store the <2 x half> sum.
; Sub-vector-sized, so expected to lower via scalar loads and lane inserts.
define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
; CHECK-LABEL: vld3_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r3, [r0]
; CHECK-NEXT:    ldr r0, [r0, #8]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmovx.f16 s8, s0
; CHECK-NEXT:    vmovx.f16 s4, s2
; CHECK-NEXT:    vins.f16 s8, s2
; CHECK-NEXT:    vmovx.f16 s6, s1
; CHECK-NEXT:    vins.f16 s1, s4
; CHECK-NEXT:    vins.f16 s0, s6
; CHECK-NEXT:    vadd.f16 q1, q0, q2
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    str r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <6 x half>, <6 x half>* %src, align 4
  %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3> ; factor 0: lanes 0,3
  %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4> ; factor 1: lanes 1,4
  %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5> ; factor 2: lanes 2,5
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <2 x half> %s1, %s2
  %a = fadd <2 x half> %a1, %s3
  store <2 x half> %a, <2 x half> *%dst
  ret void
}
1101
; De-interleaving (vld3-style) load test: load <12 x half>, split it into the
; three stride-3 subsequences, fadd them together, and store the <4 x half> sum.
; Sub-vector-sized, so expected to lower via vmovx/vins lane shuffling.
define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld3_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8}
; CHECK-NEXT:    vpush {d8}
; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.f32 s1, s8
; CHECK-NEXT:    vmovx.f16 s4, s9
; CHECK-NEXT:    vins.f16 s1, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmovx.f16 s8, s8
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vmovx.f16 s16, s5
; CHECK-NEXT:    vins.f16 s12, s6
; CHECK-NEXT:    vins.f16 s4, s16
; CHECK-NEXT:    vmovx.f16 s16, s6
; CHECK-NEXT:    vins.f16 s5, s16
; CHECK-NEXT:    vmovx.f16 s13, s7
; CHECK-NEXT:    vins.f16 s7, s8
; CHECK-NEXT:    vmov.f32 s0, s5
; CHECK-NEXT:    vins.f16 s13, s9
; CHECK-NEXT:    vmov.f32 s5, s7
; CHECK-NEXT:    vadd.f16 q1, q1, q3
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vmov r0, r2, d0
; CHECK-NEXT:    strd r0, r2, [r1]
; CHECK-NEXT:    vpop {d8}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <12 x half>, <12 x half>* %src, align 4
  %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; factor 0: lanes 0,3,6,9
  %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; factor 1: lanes 1,4,7,10
  %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> ; factor 2: lanes 2,5,8,11
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <4 x half> %s1, %s2
  %a = fadd <4 x half> %a1, %s3
  store <4 x half> %a, <4 x half> *%dst
  ret void
}
1142
; De-interleaving (vld3-style) load test: load <24 x half>, split it into the
; three stride-3 subsequences, fadd them together, and store the <8 x half> sum.
define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld3_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vmov.f32 s4, s1
; CHECK-NEXT:    vins.f16 s4, s8
; CHECK-NEXT:    vmovx.f16 s8, s17
; CHECK-NEXT:    vmov.f32 s5, s16
; CHECK-NEXT:    vmovx.f16 s20, s15
; CHECK-NEXT:    vins.f16 s5, s8
; CHECK-NEXT:    vmov.f32 s11, s14
; CHECK-NEXT:    vins.f16 s11, s20
; CHECK-NEXT:    vmov.f32 s6, s19
; CHECK-NEXT:    vmovx.f16 s20, s12
; CHECK-NEXT:    vmov.f32 s28, s18
; CHECK-NEXT:    vins.f16 s6, s20
; CHECK-NEXT:    vmovx.f16 s20, s19
; CHECK-NEXT:    vins.f16 s28, s20
; CHECK-NEXT:    vmovx.f16 s24, s1
; CHECK-NEXT:    vmovx.f16 s20, s0
; CHECK-NEXT:    vins.f16 s0, s24
; CHECK-NEXT:    vins.f16 s20, s2
; CHECK-NEXT:    vmovx.f16 s26, s16
; CHECK-NEXT:    vmovx.f16 s21, s3
; CHECK-NEXT:    vins.f16 s3, s26
; CHECK-NEXT:    vins.f16 s21, s17
; CHECK-NEXT:    vmovx.f16 s30, s14
; CHECK-NEXT:    vmovx.f16 s23, s13
; CHECK-NEXT:    vmov.f32 s10, s12
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vins.f16 s13, s30
; CHECK-NEXT:    vins.f16 s23, s15
; CHECK-NEXT:    vmov.f32 s2, s28
; CHECK-NEXT:    vmovx.f16 s22, s18
; CHECK-NEXT:    vmov.f32 s3, s13
; CHECK-NEXT:    vins.f16 s22, s12
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vadd.f16 q0, q0, q5
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <24 x half>, <24 x half>* %src, align 4
  %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> ; factor 0: lanes 0,3,...
  %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> ; factor 1: lanes 1,4,...
  %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> ; factor 2: lanes 2,5,...
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <8 x half> %s1, %s2
  %a = fadd <8 x half> %a1, %s3
  store <8 x half> %a, <8 x half> *%dst
  ret void
}
1200
; De-interleaving (vld3-style) load test: load <48 x half>, split it into the
; three stride-3 subsequences, fadd them together, and store the <16 x half>
; sum. Wider than one vector, so the lowering is done in two 8-element chunks.
define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vld3_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
; CHECK-NEXT:    vmovx.f16 s8, s2
; CHECK-NEXT:    vmov.f32 s4, s1
; CHECK-NEXT:    vins.f16 s4, s8
; CHECK-NEXT:    vmovx.f16 s8, s13
; CHECK-NEXT:    vmov.f32 s5, s12
; CHECK-NEXT:    vmovx.f16 s24, s1
; CHECK-NEXT:    vins.f16 s5, s8
; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
; CHECK-NEXT:    vmov.f32 s6, s15
; CHECK-NEXT:    vmovx.f16 s26, s12
; CHECK-NEXT:    vmovx.f16 s20, s11
; CHECK-NEXT:    vmov.f32 s19, s10
; CHECK-NEXT:    vins.f16 s19, s20
; CHECK-NEXT:    vmovx.f16 s20, s8
; CHECK-NEXT:    vins.f16 s6, s20
; CHECK-NEXT:    vmovx.f16 s20, s15
; CHECK-NEXT:    vmov.f32 s28, s14
; CHECK-NEXT:    vmovx.f16 s30, s10
; CHECK-NEXT:    vins.f16 s28, s20
; CHECK-NEXT:    vmovx.f16 s20, s0
; CHECK-NEXT:    vins.f16 s0, s24
; CHECK-NEXT:    vins.f16 s20, s2
; CHECK-NEXT:    vmovx.f16 s21, s3
; CHECK-NEXT:    vins.f16 s3, s26
; CHECK-NEXT:    vins.f16 s21, s13
; CHECK-NEXT:    vmov.f32 s18, s8
; CHECK-NEXT:    vmovx.f16 s23, s9
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vins.f16 s9, s30
; CHECK-NEXT:    vins.f16 s23, s11
; CHECK-NEXT:    vmovx.f16 s22, s14
; CHECK-NEXT:    vmov.f32 s2, s28
; CHECK-NEXT:    vins.f16 s22, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov.f32 s7, s19
; CHECK-NEXT:    vadd.f16 q0, q0, q5
; CHECK-NEXT:    vadd.f16 q1, q0, q1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vmovx.f16 s16, s2
; CHECK-NEXT:    vmov.f32 s4, s1
; CHECK-NEXT:    vmovx.f16 s20, s11
; CHECK-NEXT:    vins.f16 s4, s16
; CHECK-NEXT:    vmovx.f16 s16, s13
; CHECK-NEXT:    vmov.f32 s5, s12
; CHECK-NEXT:    vmovx.f16 s24, s1
; CHECK-NEXT:    vins.f16 s5, s16
; CHECK-NEXT:    vmov.f32 s19, s10
; CHECK-NEXT:    vins.f16 s19, s20
; CHECK-NEXT:    vmov.f32 s6, s15
; CHECK-NEXT:    vmovx.f16 s20, s8
; CHECK-NEXT:    vmov.f32 s28, s14
; CHECK-NEXT:    vins.f16 s6, s20
; CHECK-NEXT:    vmovx.f16 s20, s15
; CHECK-NEXT:    vins.f16 s28, s20
; CHECK-NEXT:    vmovx.f16 s20, s0
; CHECK-NEXT:    vins.f16 s0, s24
; CHECK-NEXT:    vins.f16 s20, s2
; CHECK-NEXT:    vmovx.f16 s21, s3
; CHECK-NEXT:    vmovx.f16 s26, s12
; CHECK-NEXT:    vins.f16 s21, s13
; CHECK-NEXT:    vins.f16 s3, s26
; CHECK-NEXT:    vmovx.f16 s30, s10
; CHECK-NEXT:    vmovx.f16 s23, s9
; CHECK-NEXT:    vmov.f32 s18, s8
; CHECK-NEXT:    vins.f16 s9, s30
; CHECK-NEXT:    vins.f16 s23, s11
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vmovx.f16 s22, s14
; CHECK-NEXT:    vmov.f32 s2, s28
; CHECK-NEXT:    vins.f16 s22, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov.f32 s7, s19
; CHECK-NEXT:    vadd.f16 q0, q0, q5
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <48 x half>, <48 x half>* %src, align 4
  %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> ; factor 0: lanes 0,3,...
  %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> ; factor 1: lanes 1,4,...
  %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> ; factor 2: lanes 2,5,...
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <16 x half> %s1, %s2
  %a = fadd <16 x half> %a1, %s3
  store <16 x half> %a, <16 x half> *%dst
  ret void
}
1298
1299; f64
1300
; De-interleaving (vld3-style) load test: load <6 x double>, split it into the
; three stride-3 subsequences, fadd them together, and store the <2 x double>
; sum. f64 lanes are handled with scalar vadd.f64 on d registers.
define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) {
; CHECK-LABEL: vld3_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vadd.f64 d4, d3, d0
; CHECK-NEXT:    vadd.f64 d5, d6, d7
; CHECK-NEXT:    vadd.f64 d1, d4, d1
; CHECK-NEXT:    vadd.f64 d0, d5, d2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <6 x double>, <6 x double>* %src, align 4
  %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3> ; factor 0: lanes 0,3
  %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4> ; factor 1: lanes 1,4
  %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5> ; factor 2: lanes 2,5
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <2 x double> %s1, %s2
  %a = fadd <2 x double> %a1, %s3
  store <2 x double> %a, <2 x double> *%dst
  ret void
}
1323
; De-interleaving (vld3-style) load test: load <12 x double>, split it into the
; three stride-3 subsequences, fadd them together, and store the <4 x double>
; sum. f64 lanes are handled with scalar vadd.f64 on d registers.
define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) {
; CHECK-LABEL: vld3_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vadd.f64 d5, d6, d7
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q6, [r0]
; CHECK-NEXT:    vadd.f64 d4, d1, d2
; CHECK-NEXT:    vadd.f64 d10, d9, d6
; CHECK-NEXT:    vadd.f64 d11, d12, d13
; CHECK-NEXT:    vadd.f64 d3, d4, d3
; CHECK-NEXT:    vadd.f64 d2, d5, d0
; CHECK-NEXT:    vadd.f64 d1, d10, d7
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vadd.f64 d0, d11, d8
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  ; Deliberately under-aligned (align 4) load of the interleaved source data.
  %l1 = load <12 x double>, <12 x double>* %src, align 4
  %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; factor 0: lanes 0,3,6,9
  %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; factor 1: lanes 1,4,7,10
  %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> ; factor 2: lanes 2,5,8,11
  ; Sum the three de-interleaved factors so all of them are kept live.
  %a1 = fadd <4 x double> %s1, %s2
  %a = fadd <4 x double> %a1, %s3
  store <4 x double> %a, <4 x double> *%dst
  ret void
}
1357