1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
3
4; i32
5
; vst3_v2i32: load three <2 x i32> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, b1, c1)
; and store the resulting <6 x i32> to %dst.
; The CHECK lines are the expected llc output for this function; they are
; autogenerated, so regenerate them rather than editing by hand if the IR changes.
6define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
7; CHECK-LABEL: vst3_v2i32:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    .save {r4, lr}
10; CHECK-NEXT:    push {r4, lr}
11; CHECK-NEXT:    ldrd lr, r12, [r0]
12; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
13; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
14; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
15; CHECK-NEXT:    vmov.32 q0[0], r4
16; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
17; CHECK-NEXT:    vmov.32 q0[1], r0
18; CHECK-NEXT:    vmov.f32 s8, s7
19; CHECK-NEXT:    vmov.f32 s10, s1
20; CHECK-NEXT:    vmov r2, s8
21; CHECK-NEXT:    vmov.f64 d4, d2
22; CHECK-NEXT:    vmov.f32 s9, s6
23; CHECK-NEXT:    vmov.f32 s10, s0
24; CHECK-NEXT:    vmov.f32 s11, s5
25; CHECK-NEXT:    vstrw.32 q2, [r1]
26; CHECK-NEXT:    strd r2, r0, [r1, #16]
27; CHECK-NEXT:    pop {r4, pc}
28entry:
29  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
30  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
31  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
32  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
33  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
34  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
35  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
36  %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
37  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
38  store <6 x i32> %s, <6 x i32> *%dst
39  ret void
40}
41
; vst3_v4i32: load three <4 x i32> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <12 x i32> to %dst.
; The CHECK lines are the autogenerated expected llc output: three vldrw loads,
; lane moves, and three vstrw stores.
42define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) {
43; CHECK-LABEL: vst3_v4i32:
44; CHECK:       @ %bb.0: @ %entry
45; CHECK-NEXT:    .vsave {d8, d9}
46; CHECK-NEXT:    vpush {d8, d9}
47; CHECK-NEXT:    vldrw.u32 q3, [r0]
48; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
49; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
50; CHECK-NEXT:    vmov.f64 d8, d6
51; CHECK-NEXT:    vmov.f32 s17, s4
52; CHECK-NEXT:    vmov.f32 s8, s5
53; CHECK-NEXT:    vmov.f32 s19, s13
54; CHECK-NEXT:    vmov.f32 s9, s1
55; CHECK-NEXT:    vmov.f32 s18, s0
56; CHECK-NEXT:    vmov.f32 s0, s2
57; CHECK-NEXT:    vstrw.32 q4, [r1]
58; CHECK-NEXT:    vmov.f32 s11, s6
59; CHECK-NEXT:    vmov.f32 s1, s15
60; CHECK-NEXT:    vmov.f32 s10, s14
61; CHECK-NEXT:    vmov.f32 s2, s7
62; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
63; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
64; CHECK-NEXT:    vpop {d8, d9}
65; CHECK-NEXT:    bx lr
66entry:
67  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
68  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
69  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
70  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
71  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
72  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
73  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
74  %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
75  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
76  store <12 x i32> %s, <12 x i32> *%dst
77  ret void
78}
79
; vst3_v8i32: load three <8 x i32> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <24 x i32> to %dst.
; The autogenerated CHECK lines record the expected llc output, which
; currently reserves 16 bytes of stack for one q-register spill/reload.
80define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) {
81; CHECK-LABEL: vst3_v8i32:
82; CHECK:       @ %bb.0: @ %entry
83; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
84; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
85; CHECK-NEXT:    .pad #16
86; CHECK-NEXT:    sub sp, #16
87; CHECK-NEXT:    vldrw.u32 q4, [r0]
88; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
89; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
90; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
91; CHECK-NEXT:    vmov.f64 d10, d8
92; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
93; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
94; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
95; CHECK-NEXT:    vmov.f32 s21, s28
96; CHECK-NEXT:    vmov.f64 d14, d12
97; CHECK-NEXT:    vmov.f64 d4, d1
98; CHECK-NEXT:    vmov.f32 s29, s12
99; CHECK-NEXT:    vmov.f32 s9, s27
100; CHECK-NEXT:    vmov.f32 s31, s25
101; CHECK-NEXT:    vmov.f32 s11, s3
102; CHECK-NEXT:    vmov.f32 s30, s0
103; CHECK-NEXT:    vmov.f32 s0, s13
104; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
105; CHECK-NEXT:    vmov.f32 s3, s14
106; CHECK-NEXT:    vmov.f32 s2, s26
107; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
108; CHECK-NEXT:    vmov.f32 s10, s15
109; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
110; CHECK-NEXT:    vmov.f32 s23, s17
111; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
112; CHECK-NEXT:    vmov.f32 s12, s25
113; CHECK-NEXT:    vmov.f32 s13, s5
114; CHECK-NEXT:    vmov.f32 s22, s4
115; CHECK-NEXT:    vmov.f32 s4, s6
116; CHECK-NEXT:    vstrw.32 q5, [r1]
117; CHECK-NEXT:    vmov.f32 s15, s26
118; CHECK-NEXT:    vmov.f32 s5, s19
119; CHECK-NEXT:    vmov.f32 s14, s18
120; CHECK-NEXT:    vmov.f32 s6, s27
121; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
122; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
123; CHECK-NEXT:    add sp, #16
124; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
125; CHECK-NEXT:    bx lr
126entry:
127  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
128  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
129  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
130  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
131  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
132  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
133  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
134  %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
135  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
136  store <24 x i32> %s, <24 x i32> *%dst
137  ret void
138}
139
; vst3_v16i32: load three <16 x i32> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <48 x i32> to %dst.
; The autogenerated CHECK lines record the expected llc output, which
; currently reserves a 160-byte stack area for q-register spills and reloads.
140define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
141; CHECK-LABEL: vst3_v16i32:
142; CHECK:       @ %bb.0: @ %entry
143; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
144; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
145; CHECK-NEXT:    .pad #160
146; CHECK-NEXT:    sub sp, #160
147; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
148; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
149; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
150; CHECK-NEXT:    vldrw.u32 q6, [r0]
151; CHECK-NEXT:    vstrw.32 q7, [sp, #112] @ 16-byte Spill
152; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
153; CHECK-NEXT:    vmov.f32 s16, s1
154; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
155; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
156; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
157; CHECK-NEXT:    vmov.f32 s17, s9
158; CHECK-NEXT:    vstrw.32 q3, [sp, #128] @ 16-byte Spill
159; CHECK-NEXT:    vmov.f32 s19, s2
160; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
161; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
162; CHECK-NEXT:    vmov.f32 s18, s26
163; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
164; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
165; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
166; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
167; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
168; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
169; CHECK-NEXT:    vmov.f64 d8, d5
170; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
171; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
172; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
173; CHECK-NEXT:    vmov.f32 s17, s27
174; CHECK-NEXT:    vmov.f32 s19, s11
175; CHECK-NEXT:    vmov.f32 s18, s3
176; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
177; CHECK-NEXT:    vmov.f64 d8, d3
178; CHECK-NEXT:    vmov.f32 s17, s31
179; CHECK-NEXT:    vmov.f32 s19, s7
180; CHECK-NEXT:    vmov.f32 s18, s15
181; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
182; CHECK-NEXT:    vmov.f64 d8, d12
183; CHECK-NEXT:    vmov.f32 s17, s0
184; CHECK-NEXT:    vmov.f32 s19, s25
185; CHECK-NEXT:    vmov.f32 s18, s8
186; CHECK-NEXT:    vmov q2, q7
187; CHECK-NEXT:    vmov.f64 d0, d4
188; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
189; CHECK-NEXT:    vmov.f32 s1, s12
190; CHECK-NEXT:    vmov.f32 s3, s9
191; CHECK-NEXT:    vmov.f32 s2, s4
192; CHECK-NEXT:    vmov.f32 s4, s13
193; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
194; CHECK-NEXT:    vmov.f32 s7, s14
195; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
196; CHECK-NEXT:    vmov.f32 s6, s10
197; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
198; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
199; CHECK-NEXT:    vmov.f64 d4, d1
200; CHECK-NEXT:    vmov q3, q1
201; CHECK-NEXT:    vmov.f32 s16, s5
202; CHECK-NEXT:    vmov.f32 s17, s1
203; CHECK-NEXT:    vmov.f32 s19, s6
204; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
205; CHECK-NEXT:    vmov.f64 d12, d11
206; CHECK-NEXT:    vmov q7, q1
207; CHECK-NEXT:    vmov.f32 s9, s7
208; CHECK-NEXT:    vmov.f32 s18, s6
209; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
210; CHECK-NEXT:    vmov.f32 s11, s3
211; CHECK-NEXT:    vmov q0, q7
212; CHECK-NEXT:    vmov.f32 s25, s7
213; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
214; CHECK-NEXT:    vmov.f32 s27, s23
215; CHECK-NEXT:    vldrw.u32 q5, [sp, #112] @ 16-byte Reload
216; CHECK-NEXT:    vmov.f32 s10, s15
217; CHECK-NEXT:    vldrw.u32 q3, [sp, #144] @ 16-byte Reload
218; CHECK-NEXT:    vmov.f32 s29, s20
219; CHECK-NEXT:    vmov q5, q1
220; CHECK-NEXT:    vmov.f32 s31, s1
221; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
222; CHECK-NEXT:    vmov.f32 s26, s15
223; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
224; CHECK-NEXT:    vmov.f32 s30, s0
225; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
226; CHECK-NEXT:    vmov.f64 d0, d2
227; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
228; CHECK-NEXT:    vmov.f32 s1, s12
229; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
230; CHECK-NEXT:    vmov.f32 s3, s5
231; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
232; CHECK-NEXT:    vmov.f32 s2, s12
233; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
234; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
235; CHECK-NEXT:    vmov.f32 s12, s5
236; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
237; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
238; CHECK-NEXT:    vmov.f32 s15, s6
239; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
240; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
241; CHECK-NEXT:    vmov.f32 s14, s22
242; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
243; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
244; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
245; CHECK-NEXT:    vstrw.32 q0, [r1]
246; CHECK-NEXT:    add sp, #160
247; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
248; CHECK-NEXT:    bx lr
249entry:
250  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
251  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
252  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
253  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
254  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
255  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
256  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
257  %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
258  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
259  store <48 x i32> %s, <48 x i32> *%dst
260  ret void
261}
262
263; i16
264
; vst3_v2i16: load three <2 x i16> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, b1, c1)
; and store the resulting <6 x i16> to %dst.
; The autogenerated CHECK lines record the expected llc output, which loads
; the six halfwords individually with ldrh rather than with a vector load.
265define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
266; CHECK-LABEL: vst3_v2i16:
267; CHECK:       @ %bb.0: @ %entry
268; CHECK-NEXT:    .save {r4, lr}
269; CHECK-NEXT:    push {r4, lr}
270; CHECK-NEXT:    ldrh r2, [r0, #6]
271; CHECK-NEXT:    ldrh.w lr, [r0, #4]
272; CHECK-NEXT:    ldrh.w r12, [r0, #8]
273; CHECK-NEXT:    vmov.16 q0[4], r2
274; CHECK-NEXT:    ldrh r3, [r0, #2]
275; CHECK-NEXT:    vmov q1[2], q1[0], lr, r2
276; CHECK-NEXT:    ldrh r4, [r0]
277; CHECK-NEXT:    ldrh r0, [r0, #10]
278; CHECK-NEXT:    vmov.16 q0[5], r0
279; CHECK-NEXT:    vmov r0, s2
280; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
281; CHECK-NEXT:    vmov.f32 s1, s4
282; CHECK-NEXT:    vmov.f32 s3, s2
283; CHECK-NEXT:    vmov.32 q0[2], r12
284; CHECK-NEXT:    vstrh.32 q0, [r1]
285; CHECK-NEXT:    str r0, [r1, #8]
286; CHECK-NEXT:    pop {r4, pc}
287entry:
288  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
289  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
290  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
291  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
292  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
293  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
  ; (The trailing <N x i32> on each shufflevector is the mask type, not the
  ; element type of the result.)
294  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
295  %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
296  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
297  store <6 x i16> %s, <6 x i16> *%dst
298  ret void
299}
300
; vst3_v4i16: load three <4 x i16> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <12 x i16> to %dst.
; The autogenerated CHECK lines record the expected llc output, which uses
; widening vldrh.u32 loads and rebuilds the result through core registers.
301define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
302; CHECK-LABEL: vst3_v4i16:
303; CHECK:       @ %bb.0: @ %entry
304; CHECK-NEXT:    .save {r4, r5, r7, lr}
305; CHECK-NEXT:    push {r4, r5, r7, lr}
306; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
307; CHECK-NEXT:    vldrh.u32 q1, [r0]
308; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
309; CHECK-NEXT:    vmov.f64 d6, d5
310; CHECK-NEXT:    vmov.f32 s13, s7
311; CHECK-NEXT:    vmov r0, r5, d2
312; CHECK-NEXT:    vmov r2, r3, d0
313; CHECK-NEXT:    vmov lr, r4, d1
314; CHECK-NEXT:    vmov.16 q0[0], r0
315; CHECK-NEXT:    vmov.f32 s15, s11
316; CHECK-NEXT:    vmov.16 q0[1], r2
317; CHECK-NEXT:    vmov.32 q3[2], r4
318; CHECK-NEXT:    vmov r0, r4, d4
319; CHECK-NEXT:    vmov.16 q0[2], r0
320; CHECK-NEXT:    vmov r12, s6
321; CHECK-NEXT:    vmov.16 q0[3], r5
322; CHECK-NEXT:    vstrh.32 q3, [r1, #16]
323; CHECK-NEXT:    vmov.16 q0[4], r3
324; CHECK-NEXT:    vmov.16 q0[5], r4
325; CHECK-NEXT:    vmov.16 q0[6], r12
326; CHECK-NEXT:    vmov.16 q0[7], lr
327; CHECK-NEXT:    vstrw.32 q0, [r1]
328; CHECK-NEXT:    pop {r4, r5, r7, pc}
329entry:
330  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
331  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
332  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
333  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
334  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
335  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
336  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
337  %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
338  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
339  store <12 x i16> %s, <12 x i16> *%dst
340  ret void
341}
342
; vst3_v8i16: load three <8 x i16> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <24 x i16> to %dst.
; The autogenerated CHECK lines record the expected llc output, which builds
; the three output q registers from vmovx.f16/vins.f16 half-lane moves.
343define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
344; CHECK-LABEL: vst3_v8i16:
345; CHECK:       @ %bb.0: @ %entry
346; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
347; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
348; CHECK-NEXT:    vldrw.u32 q2, [r0]
349; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
350; CHECK-NEXT:    vmov.f64 d0, d4
351; CHECK-NEXT:    vmov.u16 r2, q1[1]
352; CHECK-NEXT:    vmovx.f16 s20, s8
353; CHECK-NEXT:    vins.f16 s0, s4
354; CHECK-NEXT:    vmov.f32 s12, s9
355; CHECK-NEXT:    vins.f16 s12, s5
356; CHECK-NEXT:    vmov.16 q0[4], r2
357; CHECK-NEXT:    vmov.f32 s3, s12
358; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
359; CHECK-NEXT:    vmov.f32 s1, s8
360; CHECK-NEXT:    vmov.f32 s17, s12
361; CHECK-NEXT:    vmov.f32 s18, s12
362; CHECK-NEXT:    vins.f16 s17, s20
363; CHECK-NEXT:    vmovx.f16 s20, s18
364; CHECK-NEXT:    vins.f16 s2, s20
365; CHECK-NEXT:    vmovx.f16 s20, s14
366; CHECK-NEXT:    vmov.f32 s18, s2
367; CHECK-NEXT:    vmov.f32 s1, s17
368; CHECK-NEXT:    vmov.f32 s2, s18
369; CHECK-NEXT:    vmovx.f16 s16, s6
370; CHECK-NEXT:    vins.f16 s16, s20
371; CHECK-NEXT:    vmovx.f16 s20, s15
372; CHECK-NEXT:    vins.f16 s17, s7
373; CHECK-NEXT:    vstrw.32 q0, [r1]
374; CHECK-NEXT:    vmovx.f16 s19, s7
375; CHECK-NEXT:    vrev32.16 q1, q1
376; CHECK-NEXT:    vins.f16 s19, s20
377; CHECK-NEXT:    vmov.f32 s21, s11
378; CHECK-NEXT:    vmov.f32 s18, s15
379; CHECK-NEXT:    vmovx.f16 s24, s17
380; CHECK-NEXT:    vmov.f32 s22, s11
381; CHECK-NEXT:    vins.f16 s21, s24
382; CHECK-NEXT:    vmovx.f16 s24, s22
383; CHECK-NEXT:    vins.f16 s18, s24
384; CHECK-NEXT:    vmov.f32 s12, s13
385; CHECK-NEXT:    vmov.f32 s22, s18
386; CHECK-NEXT:    vmov.f32 s17, s21
387; CHECK-NEXT:    vmov.f32 s18, s22
388; CHECK-NEXT:    vmovx.f16 s20, s9
389; CHECK-NEXT:    vins.f16 s12, s20
390; CHECK-NEXT:    vmovx.f16 s20, s10
391; CHECK-NEXT:    vins.f16 s14, s20
392; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
393; CHECK-NEXT:    vmov.f32 s15, s14
394; CHECK-NEXT:    vmov.f32 s14, s10
395; CHECK-NEXT:    vmovx.f16 s8, s13
396; CHECK-NEXT:    vins.f16 s5, s8
397; CHECK-NEXT:    vmovx.f16 s8, s6
398; CHECK-NEXT:    vins.f16 s14, s8
399; CHECK-NEXT:    vmov.f32 s6, s14
400; CHECK-NEXT:    vmov.f32 s13, s5
401; CHECK-NEXT:    vmov.f32 s14, s6
402; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
403; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
404; CHECK-NEXT:    bx lr
405entry:
406  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
407  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
408  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
409  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
410  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
411  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
412  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
413  %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
414  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
415  store <24 x i16> %s, <24 x i16> *%dst
416  ret void
417}
418
; vst3_v16i16: load three <16 x i16> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <48 x i16> to %dst.
; The autogenerated CHECK lines record the expected llc output, which
; currently reserves an 80-byte stack area for spills and reloads.
419define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
420; CHECK-LABEL: vst3_v16i16:
421; CHECK:       @ %bb.0: @ %entry
422; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
423; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
424; CHECK-NEXT:    .pad #80
425; CHECK-NEXT:    sub sp, #80
426; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
427; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
428; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
429; CHECK-NEXT:    vmovx.f16 s0, s14
430; CHECK-NEXT:    vmovx.f16 s8, s22
431; CHECK-NEXT:    vins.f16 s8, s0
432; CHECK-NEXT:    vmovx.f16 s0, s15
433; CHECK-NEXT:    vins.f16 s9, s23
434; CHECK-NEXT:    vmov.u16 r2, q6[1]
435; CHECK-NEXT:    vmovx.f16 s11, s23
436; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
437; CHECK-NEXT:    vins.f16 s11, s0
438; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
439; CHECK-NEXT:    vmov.f32 s10, s15
440; CHECK-NEXT:    vmovx.f16 s4, s9
441; CHECK-NEXT:    vmov q4, q2
442; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
443; CHECK-NEXT:    vmov.f32 s1, s11
444; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
445; CHECK-NEXT:    vmov.f32 s2, s11
446; CHECK-NEXT:    vins.f16 s1, s4
447; CHECK-NEXT:    vmovx.f16 s4, s2
448; CHECK-NEXT:    vins.f16 s18, s4
449; CHECK-NEXT:    vldrw.u32 q1, [r0]
450; CHECK-NEXT:    vmov.f32 s2, s18
451; CHECK-NEXT:    vmov.f64 d4, d2
452; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
453; CHECK-NEXT:    vmovx.f16 s28, s4
454; CHECK-NEXT:    vins.f16 s8, s24
455; CHECK-NEXT:    vmov.f32 s17, s1
456; CHECK-NEXT:    vmov.16 q2[4], r2
457; CHECK-NEXT:    vmov.f32 s11, s5
458; CHECK-NEXT:    vins.f16 s11, s25
459; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
460; CHECK-NEXT:    vmov.f32 s9, s4
461; CHECK-NEXT:    vmov.u16 r0, q5[1]
462; CHECK-NEXT:    vmov.f32 s5, s24
463; CHECK-NEXT:    vmov.f32 s6, s24
464; CHECK-NEXT:    vins.f16 s5, s28
465; CHECK-NEXT:    vmovx.f16 s28, s6
466; CHECK-NEXT:    vins.f16 s10, s28
467; CHECK-NEXT:    vmov.f32 s18, s2
468; CHECK-NEXT:    vmov.f32 s6, s10
469; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
470; CHECK-NEXT:    vmov.f32 s9, s5
471; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
472; CHECK-NEXT:    vmov.f32 s10, s6
473; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
474; CHECK-NEXT:    vstrw.32 q2, [r1]
475; CHECK-NEXT:    vmov.f64 d14, d2
476; CHECK-NEXT:    vins.f16 s28, s20
477; CHECK-NEXT:    vmov.f32 s0, s5
478; CHECK-NEXT:    vins.f16 s0, s21
479; CHECK-NEXT:    vmov.16 q7[4], r0
480; CHECK-NEXT:    vmov.f32 s31, s0
481; CHECK-NEXT:    vldrw.u32 q5, [sp, #64] @ 16-byte Reload
482; CHECK-NEXT:    vmov.f32 s1, s12
483; CHECK-NEXT:    vmov.f32 s29, s4
484; CHECK-NEXT:    vmovx.f16 s4, s4
485; CHECK-NEXT:    vmov.f32 s2, s12
486; CHECK-NEXT:    vins.f16 s1, s4
487; CHECK-NEXT:    vmovx.f16 s4, s2
488; CHECK-NEXT:    vins.f16 s30, s4
489; CHECK-NEXT:    vmovx.f16 s4, s26
490; CHECK-NEXT:    vmov.f32 s2, s30
491; CHECK-NEXT:    vmov.f32 s29, s1
492; CHECK-NEXT:    vmov.f32 s12, s13
493; CHECK-NEXT:    vmov.f32 s30, s2
494; CHECK-NEXT:    vmovx.f16 s0, s18
495; CHECK-NEXT:    vins.f16 s0, s4
496; CHECK-NEXT:    vmov q1, q4
497; CHECK-NEXT:    vins.f16 s1, s7
498; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
499; CHECK-NEXT:    vmovx.f16 s3, s7
500; CHECK-NEXT:    vmovx.f16 s4, s27
501; CHECK-NEXT:    vins.f16 s3, s4
502; CHECK-NEXT:    vmov.f32 s5, s23
503; CHECK-NEXT:    vmov.f32 s2, s27
504; CHECK-NEXT:    vmovx.f16 s16, s1
505; CHECK-NEXT:    vmov.f32 s6, s23
506; CHECK-NEXT:    vins.f16 s5, s16
507; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
508; CHECK-NEXT:    vmovx.f16 s20, s6
509; CHECK-NEXT:    vmov.f32 s24, s25
510; CHECK-NEXT:    vins.f16 s2, s20
511; CHECK-NEXT:    vmovx.f16 s20, s17
512; CHECK-NEXT:    vins.f16 s12, s20
513; CHECK-NEXT:    vmovx.f16 s20, s18
514; CHECK-NEXT:    vins.f16 s14, s20
515; CHECK-NEXT:    vmov.f32 s6, s2
516; CHECK-NEXT:    vmov.f32 s15, s14
517; CHECK-NEXT:    vmov.f32 s14, s18
518; CHECK-NEXT:    vmovx.f16 s16, s13
519; CHECK-NEXT:    vstr s16, [sp, #32] @ 4-byte Spill
520; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
521; CHECK-NEXT:    vmov.f32 s1, s5
522; CHECK-NEXT:    vrev32.16 q5, q4
523; CHECK-NEXT:    vldr s16, [sp, #32] @ 4-byte Reload
524; CHECK-NEXT:    vins.f16 s21, s16
525; CHECK-NEXT:    vmovx.f16 s16, s22
526; CHECK-NEXT:    vins.f16 s14, s16
527; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
528; CHECK-NEXT:    vmov.f32 s2, s6
529; CHECK-NEXT:    vmovx.f16 s4, s17
530; CHECK-NEXT:    vmov.f32 s22, s14
531; CHECK-NEXT:    vins.f16 s24, s4
532; CHECK-NEXT:    vmovx.f16 s4, s18
533; CHECK-NEXT:    vins.f16 s26, s4
534; CHECK-NEXT:    vmov.f32 s13, s21
535; CHECK-NEXT:    vmov.f32 s27, s26
536; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
537; CHECK-NEXT:    vmov.f32 s26, s18
538; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
539; CHECK-NEXT:    vmovx.f16 s4, s25
540; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
541; CHECK-NEXT:    vrev32.16 q4, q4
542; CHECK-NEXT:    vins.f16 s17, s4
543; CHECK-NEXT:    vmovx.f16 s4, s18
544; CHECK-NEXT:    vins.f16 s26, s4
545; CHECK-NEXT:    vmov.f32 s14, s22
546; CHECK-NEXT:    vmov.f32 s18, s26
547; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
548; CHECK-NEXT:    vmov.f32 s25, s17
549; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
550; CHECK-NEXT:    vmov.f32 s26, s18
551; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
552; CHECK-NEXT:    add sp, #80
553; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
554; CHECK-NEXT:    bx lr
555entry:
556  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
557  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
558  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
559  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
560  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
561  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
562  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
563  %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
564  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
565  store <48 x i16> %s, <48 x i16> *%dst
566  ret void
567}
568
569; i8
570
; vst3_v2i8: load three <2 x i8> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, b1, c1)
; and store the resulting <6 x i8> to %dst.
; The autogenerated CHECK lines record the expected llc output, which loads
; the six bytes individually with ldrb, reserves 16 bytes of stack, and
; writes the result to %dst with a word store (str) plus a halfword store (strh).
571define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
572; CHECK-LABEL: vst3_v2i8:
573; CHECK:       @ %bb.0: @ %entry
574; CHECK-NEXT:    .save {r4, r5, r6, lr}
575; CHECK-NEXT:    push {r4, r5, r6, lr}
576; CHECK-NEXT:    .pad #16
577; CHECK-NEXT:    sub sp, #16
578; CHECK-NEXT:    ldrb r2, [r0]
579; CHECK-NEXT:    movs r6, #0
580; CHECK-NEXT:    ldrb r3, [r0, #1]
581; CHECK-NEXT:    ldrb.w r12, [r0, #2]
582; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
583; CHECK-NEXT:    ldrb.w lr, [r0, #3]
584; CHECK-NEXT:    vmov r4, s0
585; CHECK-NEXT:    ldrb r5, [r0, #5]
586; CHECK-NEXT:    vmov.16 q0[0], r4
587; CHECK-NEXT:    ldrb r0, [r0, #4]
588; CHECK-NEXT:    vmov.16 q0[1], r12
589; CHECK-NEXT:    mov r2, sp
590; CHECK-NEXT:    vmov.16 q0[2], r0
591; CHECK-NEXT:    add r0, sp, #8
592; CHECK-NEXT:    vmov.16 q0[3], r3
593; CHECK-NEXT:    vmov.16 q0[4], lr
594; CHECK-NEXT:    vmov.16 q0[5], r5
595; CHECK-NEXT:    vmov.16 q0[6], r6
596; CHECK-NEXT:    vmov.16 q0[7], r6
597; CHECK-NEXT:    vstrb.16 q0, [r2]
598; CHECK-NEXT:    vstrb.16 q0, [r0]
599; CHECK-NEXT:    vldrh.u32 q0, [r0]
600; CHECK-NEXT:    ldr r2, [sp]
601; CHECK-NEXT:    str r2, [r1]
602; CHECK-NEXT:    vmov r0, s2
603; CHECK-NEXT:    strh r0, [r1, #4]
604; CHECK-NEXT:    add sp, #16
605; CHECK-NEXT:    pop {r4, r5, r6, pc}
606entry:
607  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
608  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
609  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
610  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
611  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
612  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
613  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
614  %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
615  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
616  store <6 x i8> %s, <6 x i8> *%dst
617  ret void
618}
619
; vst3_v4i8: load three <4 x i8> vectors from consecutive slots of %src
; (GEP indices 0, 1, 2), interleave them lane by lane (a0, b0, c0, a1, ...)
; and store the resulting <12 x i8> to %dst.
; The autogenerated CHECK lines record the expected llc output, which uses
; widening vldrb.u32 loads and rebuilds the result through core registers
; before a narrowing vstrb.16 store plus a scalar str for the last word.
620define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) {
621; CHECK-LABEL: vst3_v4i8:
622; CHECK:       @ %bb.0: @ %entry
623; CHECK-NEXT:    .save {r4, r5, r6, lr}
624; CHECK-NEXT:    push {r4, r5, r6, lr}
625; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
626; CHECK-NEXT:    vldrb.u32 q1, [r0]
627; CHECK-NEXT:    vmov r2, lr, d0
628; CHECK-NEXT:    vmov r12, r3, d1
629; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
630; CHECK-NEXT:    vmov r0, r6, d3
631; CHECK-NEXT:    vmov r4, r5, d1
632; CHECK-NEXT:    vmov.8 q2[8], r4
633; CHECK-NEXT:    vmov.8 q2[9], r6
634; CHECK-NEXT:    vmov.8 q2[10], r3
635; CHECK-NEXT:    vmov.8 q2[11], r5
636; CHECK-NEXT:    vmov r3, s10
637; CHECK-NEXT:    str r3, [r1, #8]
638; CHECK-NEXT:    vmov r3, r4, d2
639; CHECK-NEXT:    vmov.16 q1[0], r3
640; CHECK-NEXT:    vmov r3, r5, d0
641; CHECK-NEXT:    vmov.16 q1[1], r2
642; CHECK-NEXT:    vmov.16 q1[2], r3
643; CHECK-NEXT:    vmov.16 q1[3], r4
644; CHECK-NEXT:    vmov.16 q1[4], lr
645; CHECK-NEXT:    vmov.16 q1[5], r5
646; CHECK-NEXT:    vmov.16 q1[6], r0
647; CHECK-NEXT:    vmov.16 q1[7], r12
648; CHECK-NEXT:    vstrb.16 q1, [r1]
649; CHECK-NEXT:    pop {r4, r5, r6, pc}
650entry:
651  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
652  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
653  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
654  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
655  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
656  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
  ; %t1 = %l1 ++ %l2; %t2 = %l3 padded with undef lanes to the same width;
  ; %s then takes lane i of %l1, %l2 and %l3 in turn for each i.
657  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
658  %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
659  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
660  store <12 x i8> %s, <12 x i8> *%dst
661  ret void
662}
663
; vst3_v8i8: loads three <8 x i8> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <24 x i8> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the inputs are loaded with widening
; vldrb.u16, the first 16 result bytes are assembled lane by lane with vmov.8
; and the remaining 8 bytes are written with a narrowing vstrb.16.
664define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
665; CHECK-LABEL: vst3_v8i8:
666; CHECK:       @ %bb.0: @ %entry
667; CHECK-NEXT:    .vsave {d8, d9, d10}
668; CHECK-NEXT:    vpush {d8, d9, d10}
669; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
670; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
671; CHECK-NEXT:    vmovx.f16 s12, s6
672; CHECK-NEXT:    vmovx.f16 s0, s10
673; CHECK-NEXT:    vins.f16 s0, s12
674; CHECK-NEXT:    vmovx.f16 s12, s7
675; CHECK-NEXT:    vins.f16 s1, s11
676; CHECK-NEXT:    vmovx.f16 s3, s11
677; CHECK-NEXT:    vins.f16 s3, s12
678; CHECK-NEXT:    vldrb.u16 q3, [r0]
679; CHECK-NEXT:    vmov.f32 s2, s7
680; CHECK-NEXT:    vmovx.f16 s20, s1
681; CHECK-NEXT:    vmov.f32 s17, s15
682; CHECK-NEXT:    vmov.u16 r0, q3[0]
683; CHECK-NEXT:    vmov.f32 s18, s15
684; CHECK-NEXT:    vins.f16 s17, s20
685; CHECK-NEXT:    vmovx.f16 s20, s18
686; CHECK-NEXT:    vins.f16 s2, s20
687; CHECK-NEXT:    vmov.f32 s18, s2
688; CHECK-NEXT:    vmov.f32 s1, s17
689; CHECK-NEXT:    vmov.f32 s2, s18
690; CHECK-NEXT:    vmov.8 q4[0], r0
691; CHECK-NEXT:    vmov.u16 r0, q2[0]
692; CHECK-NEXT:    vstrb.16 q0, [r1, #16]
693; CHECK-NEXT:    vmov.8 q4[1], r0
694; CHECK-NEXT:    vmov.u16 r0, q1[0]
695; CHECK-NEXT:    vmov.8 q4[2], r0
696; CHECK-NEXT:    vmov.u16 r0, q3[1]
697; CHECK-NEXT:    vmov.8 q4[3], r0
698; CHECK-NEXT:    vmov.u16 r0, q2[1]
699; CHECK-NEXT:    vmov.8 q4[4], r0
700; CHECK-NEXT:    vmov.u16 r0, q1[1]
701; CHECK-NEXT:    vmov.8 q4[5], r0
702; CHECK-NEXT:    vmov.u16 r0, q3[2]
703; CHECK-NEXT:    vmov.8 q4[6], r0
704; CHECK-NEXT:    vmov.u16 r0, q2[2]
705; CHECK-NEXT:    vmov.8 q4[7], r0
706; CHECK-NEXT:    vmov.u16 r0, q1[2]
707; CHECK-NEXT:    vmov.8 q4[8], r0
708; CHECK-NEXT:    vmov.u16 r0, q3[3]
709; CHECK-NEXT:    vmov.8 q4[9], r0
710; CHECK-NEXT:    vmov.u16 r0, q2[3]
711; CHECK-NEXT:    vmov.8 q4[10], r0
712; CHECK-NEXT:    vmov.u16 r0, q1[3]
713; CHECK-NEXT:    vmov.8 q4[11], r0
714; CHECK-NEXT:    vmov.u16 r0, q3[4]
715; CHECK-NEXT:    vmov.8 q4[12], r0
716; CHECK-NEXT:    vmov.u16 r0, q2[4]
717; CHECK-NEXT:    vmov.8 q4[13], r0
718; CHECK-NEXT:    vmov.u16 r0, q1[4]
719; CHECK-NEXT:    vmov.8 q4[14], r0
720; CHECK-NEXT:    vmov.u16 r0, q3[5]
721; CHECK-NEXT:    vmov.8 q4[15], r0
722; CHECK-NEXT:    vstrw.32 q4, [r1]
723; CHECK-NEXT:    vpop {d8, d9, d10}
724; CHECK-NEXT:    bx lr
725entry:
726  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
727  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
728  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
729  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
730  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
731  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
732  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 8 lanes are undef.
733  %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
734  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
735  store <24 x i8> %s, <24 x i8> *%dst
736  ret void
737}
738
; vst3_v16i8: loads three <16 x i8> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <48 x i8> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, each of the three 16-byte result
; vectors is built one byte lane at a time (vmov.u8 to a core register, then
; vmov.8 into the destination) before being written with vstrw.32.
739define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) {
740; CHECK-LABEL: vst3_v16i8:
741; CHECK:       @ %bb.0: @ %entry
742; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
743; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
744; CHECK-NEXT:    vldrw.u32 q3, [r0]
745; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
746; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
747; CHECK-NEXT:    vmov.u8 r3, q3[0]
748; CHECK-NEXT:    vmov.u8 r0, q2[0]
749; CHECK-NEXT:    vmov.8 q5[0], r3
750; CHECK-NEXT:    vmov.u8 r2, q1[0]
751; CHECK-NEXT:    vmov.8 q5[1], r0
752; CHECK-NEXT:    vmov.u8 r0, q3[1]
753; CHECK-NEXT:    vmov.8 q5[3], r0
754; CHECK-NEXT:    vmov.u8 r0, q2[1]
755; CHECK-NEXT:    vmov.8 q5[4], r0
756; CHECK-NEXT:    vmov.u8 r0, q3[2]
757; CHECK-NEXT:    vmov.8 q5[6], r0
758; CHECK-NEXT:    vmov.u8 r0, q2[2]
759; CHECK-NEXT:    vmov.8 q5[7], r0
760; CHECK-NEXT:    vmov.u8 r0, q3[3]
761; CHECK-NEXT:    vmov.8 q5[9], r0
762; CHECK-NEXT:    vmov.u8 r0, q2[3]
763; CHECK-NEXT:    vmov.8 q5[10], r0
764; CHECK-NEXT:    vmov.u8 r0, q3[4]
765; CHECK-NEXT:    vmov.8 q4[2], r2
766; CHECK-NEXT:    vmov.u8 r2, q1[2]
767; CHECK-NEXT:    vmov.8 q5[12], r0
768; CHECK-NEXT:    vmov.u8 r0, q2[4]
769; CHECK-NEXT:    vmov.8 q4[8], r2
770; CHECK-NEXT:    vmov.u8 r2, q1[3]
771; CHECK-NEXT:    vmov.8 q5[13], r0
772; CHECK-NEXT:    vmov.u8 r0, q3[5]
773; CHECK-NEXT:    vmov.8 q5[15], r0
774; CHECK-NEXT:    vmov.8 q4[11], r2
775; CHECK-NEXT:    vmov.u8 r2, q1[4]
776; CHECK-NEXT:    vmov.u8 r0, q5[0]
777; CHECK-NEXT:    vmov.8 q4[14], r2
778; CHECK-NEXT:    vmov.8 q0[0], r0
779; CHECK-NEXT:    vmov.f32 s17, s4
780; CHECK-NEXT:    vmov.u8 r0, q5[1]
781; CHECK-NEXT:    vmov.8 q0[1], r0
782; CHECK-NEXT:    vmov.u8 r2, q4[2]
783; CHECK-NEXT:    vmov.8 q0[2], r2
784; CHECK-NEXT:    vmov.u8 r0, q5[3]
785; CHECK-NEXT:    vmov.8 q0[3], r0
786; CHECK-NEXT:    vmov.u8 r0, q5[4]
787; CHECK-NEXT:    vmov.8 q0[4], r0
788; CHECK-NEXT:    vmov.u8 r0, q4[5]
789; CHECK-NEXT:    vmov.8 q0[5], r0
790; CHECK-NEXT:    vmov.u8 r0, q5[6]
791; CHECK-NEXT:    vmov.8 q0[6], r0
792; CHECK-NEXT:    vmov.u8 r0, q5[7]
793; CHECK-NEXT:    vmov.8 q0[7], r0
794; CHECK-NEXT:    vmov.u8 r0, q4[8]
795; CHECK-NEXT:    vmov.8 q0[8], r0
796; CHECK-NEXT:    vmov.u8 r0, q5[9]
797; CHECK-NEXT:    vmov.8 q0[9], r0
798; CHECK-NEXT:    vmov.u8 r0, q5[10]
799; CHECK-NEXT:    vmov.8 q0[10], r0
800; CHECK-NEXT:    vmov.u8 r0, q4[11]
801; CHECK-NEXT:    vmov.8 q0[11], r0
802; CHECK-NEXT:    vmov.u8 r0, q5[12]
803; CHECK-NEXT:    vmov.8 q0[12], r0
804; CHECK-NEXT:    vmov.u8 r0, q5[13]
805; CHECK-NEXT:    vmov.8 q0[13], r0
806; CHECK-NEXT:    vmov.u8 r0, q4[14]
807; CHECK-NEXT:    vmov.8 q0[14], r0
808; CHECK-NEXT:    vmov.u8 r0, q5[15]
809; CHECK-NEXT:    vmov.8 q0[15], r0
810; CHECK-NEXT:    vmov.u8 r0, q2[5]
811; CHECK-NEXT:    vmov.8 q5[0], r0
812; CHECK-NEXT:    vmov.u8 r0, q1[5]
813; CHECK-NEXT:    vmov.8 q5[1], r0
814; CHECK-NEXT:    vmov.u8 r0, q2[6]
815; CHECK-NEXT:    vmov.8 q5[3], r0
816; CHECK-NEXT:    vmov.u8 r0, q1[6]
817; CHECK-NEXT:    vmov.8 q5[4], r0
818; CHECK-NEXT:    vmov.u8 r0, q2[7]
819; CHECK-NEXT:    vmov.8 q5[6], r0
820; CHECK-NEXT:    vmov.u8 r0, q1[7]
821; CHECK-NEXT:    vmov.8 q5[7], r0
822; CHECK-NEXT:    vmov.u8 r0, q2[8]
823; CHECK-NEXT:    vmov.8 q5[9], r0
824; CHECK-NEXT:    vmov.u8 r0, q1[8]
825; CHECK-NEXT:    vmov.8 q5[10], r0
826; CHECK-NEXT:    vmov.u8 r0, q2[9]
827; CHECK-NEXT:    vmov.8 q5[12], r0
828; CHECK-NEXT:    vmov.u8 r0, q1[9]
829; CHECK-NEXT:    vmov.8 q5[13], r0
830; CHECK-NEXT:    vmov.u8 r0, q2[10]
831; CHECK-NEXT:    vmov.8 q5[15], r0
832; CHECK-NEXT:    vstrw.32 q0, [r1]
833; CHECK-NEXT:    vmov.u8 r0, q5[0]
834; CHECK-NEXT:    vmov.8 q4[0], r0
835; CHECK-NEXT:    vmov.u8 r0, q5[1]
836; CHECK-NEXT:    vmov.8 q4[1], r0
837; CHECK-NEXT:    vmov.u8 r0, q3[7]
838; CHECK-NEXT:    vmov.8 q6[5], r0
839; CHECK-NEXT:    vmov.u8 r0, q3[8]
840; CHECK-NEXT:    vmov.8 q6[8], r0
841; CHECK-NEXT:    vmov.u8 r0, q3[9]
842; CHECK-NEXT:    vmov.8 q6[11], r0
843; CHECK-NEXT:    vmov.f32 s24, s13
844; CHECK-NEXT:    vmov.f32 s27, s14
845; CHECK-NEXT:    vmov.u8 r0, q6[2]
846; CHECK-NEXT:    vmov.8 q4[2], r0
847; CHECK-NEXT:    vmov.u8 r0, q5[3]
848; CHECK-NEXT:    vmov.8 q4[3], r0
849; CHECK-NEXT:    vmov.u8 r0, q5[4]
850; CHECK-NEXT:    vmov.8 q4[4], r0
851; CHECK-NEXT:    vmov.u8 r0, q6[5]
852; CHECK-NEXT:    vmov.8 q4[5], r0
853; CHECK-NEXT:    vmov.u8 r0, q5[6]
854; CHECK-NEXT:    vmov.8 q4[6], r0
855; CHECK-NEXT:    vmov.u8 r0, q5[7]
856; CHECK-NEXT:    vmov.8 q4[7], r0
857; CHECK-NEXT:    vmov.u8 r0, q6[8]
858; CHECK-NEXT:    vmov.8 q4[8], r0
859; CHECK-NEXT:    vmov.u8 r0, q5[9]
860; CHECK-NEXT:    vmov.8 q4[9], r0
861; CHECK-NEXT:    vmov.u8 r0, q5[10]
862; CHECK-NEXT:    vmov.8 q4[10], r0
863; CHECK-NEXT:    vmov.u8 r0, q6[11]
864; CHECK-NEXT:    vmov.8 q4[11], r0
865; CHECK-NEXT:    vmov.u8 r0, q5[12]
866; CHECK-NEXT:    vmov.8 q4[12], r0
867; CHECK-NEXT:    vmov.u8 r0, q5[13]
868; CHECK-NEXT:    vmov.8 q4[13], r0
869; CHECK-NEXT:    vmov.u8 r0, q6[14]
870; CHECK-NEXT:    vmov.8 q4[14], r0
871; CHECK-NEXT:    vmov.u8 r0, q5[15]
872; CHECK-NEXT:    vmov.8 q4[15], r0
873; CHECK-NEXT:    vmov.u8 r0, q1[10]
874; CHECK-NEXT:    vmov.8 q5[0], r0
875; CHECK-NEXT:    vmov.u8 r0, q3[11]
876; CHECK-NEXT:    vmov.8 q5[1], r0
877; CHECK-NEXT:    vmov.u8 r0, q1[11]
878; CHECK-NEXT:    vmov.8 q5[3], r0
879; CHECK-NEXT:    vmov.u8 r0, q3[12]
880; CHECK-NEXT:    vmov.8 q5[4], r0
881; CHECK-NEXT:    vmov.u8 r0, q1[12]
882; CHECK-NEXT:    vmov.8 q5[6], r0
883; CHECK-NEXT:    vmov.u8 r0, q3[13]
884; CHECK-NEXT:    vmov.8 q5[7], r0
885; CHECK-NEXT:    vmov.u8 r0, q1[13]
886; CHECK-NEXT:    vmov.8 q5[9], r0
887; CHECK-NEXT:    vmov.u8 r0, q3[14]
888; CHECK-NEXT:    vmov.8 q5[10], r0
889; CHECK-NEXT:    vmov.u8 r0, q1[14]
890; CHECK-NEXT:    vmov.8 q5[12], r0
891; CHECK-NEXT:    vmov.u8 r0, q3[15]
892; CHECK-NEXT:    vmov.8 q5[13], r0
893; CHECK-NEXT:    vmov.u8 r0, q1[15]
894; CHECK-NEXT:    vmov.8 q5[15], r0
895; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
896; CHECK-NEXT:    vmov.u8 r0, q5[0]
897; CHECK-NEXT:    vmov.8 q1[0], r0
898; CHECK-NEXT:    vmov.u8 r0, q5[1]
899; CHECK-NEXT:    vmov.8 q1[1], r0
900; CHECK-NEXT:    vmov.u8 r0, q2[11]
901; CHECK-NEXT:    vmov.8 q3[2], r0
902; CHECK-NEXT:    vmov.u8 r0, q2[12]
903; CHECK-NEXT:    vmov.8 q3[5], r0
904; CHECK-NEXT:    vmov.u8 r0, q2[13]
905; CHECK-NEXT:    vmov.8 q3[8], r0
906; CHECK-NEXT:    vmov.u8 r0, q2[14]
907; CHECK-NEXT:    vmov.8 q3[11], r0
908; CHECK-NEXT:    vmov.u8 r0, q2[15]
909; CHECK-NEXT:    vmov.8 q3[14], r0
910; CHECK-NEXT:    vmov.u8 r0, q3[2]
911; CHECK-NEXT:    vmov.8 q1[2], r0
912; CHECK-NEXT:    vmov.u8 r0, q5[3]
913; CHECK-NEXT:    vmov.8 q1[3], r0
914; CHECK-NEXT:    vmov.u8 r0, q5[4]
915; CHECK-NEXT:    vmov.8 q1[4], r0
916; CHECK-NEXT:    vmov.u8 r0, q3[5]
917; CHECK-NEXT:    vmov.8 q1[5], r0
918; CHECK-NEXT:    vmov.u8 r0, q5[6]
919; CHECK-NEXT:    vmov.8 q1[6], r0
920; CHECK-NEXT:    vmov.u8 r0, q5[7]
921; CHECK-NEXT:    vmov.8 q1[7], r0
922; CHECK-NEXT:    vmov.u8 r0, q3[8]
923; CHECK-NEXT:    vmov.8 q1[8], r0
924; CHECK-NEXT:    vmov.u8 r0, q5[9]
925; CHECK-NEXT:    vmov.8 q1[9], r0
926; CHECK-NEXT:    vmov.u8 r0, q5[10]
927; CHECK-NEXT:    vmov.8 q1[10], r0
928; CHECK-NEXT:    vmov.u8 r0, q3[11]
929; CHECK-NEXT:    vmov.8 q1[11], r0
930; CHECK-NEXT:    vmov.u8 r0, q5[12]
931; CHECK-NEXT:    vmov.8 q1[12], r0
932; CHECK-NEXT:    vmov.u8 r0, q5[13]
933; CHECK-NEXT:    vmov.8 q1[13], r0
934; CHECK-NEXT:    vmov.u8 r0, q3[14]
935; CHECK-NEXT:    vmov.8 q1[14], r0
936; CHECK-NEXT:    vmov.u8 r0, q5[15]
937; CHECK-NEXT:    vmov.8 q1[15], r0
938; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
939; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
940; CHECK-NEXT:    bx lr
941entry:
942  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
943  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
944  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
945  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
946  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
947  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
948  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 16 lanes are undef.
949  %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
950  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
951  store <48 x i8> %s, <48 x i8> *%dst
952  ret void
953}
954
955; i64
956
; vst3_v2i64: loads three <2 x i64> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <6 x i64> in 3-way interleaved order (a0, b0, c0, a1, b1, c1).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the shuffle is done purely with
; register-to-register vmov.f64/vmov.f32 copies and the first store
; post-increments r1 by 32, so the two later stores address relative to that.
957define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
958; CHECK-LABEL: vst3_v2i64:
959; CHECK:       @ %bb.0: @ %entry
960; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
961; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
962; CHECK-NEXT:    vldrw.u32 q1, [r0]
963; CHECK-NEXT:    vmov.f64 d6, d5
964; CHECK-NEXT:    vmov.f32 s13, s11
965; CHECK-NEXT:    vmov.f32 s14, s2
966; CHECK-NEXT:    vmov.f32 s15, s3
967; CHECK-NEXT:    vmov.f32 s2, s6
968; CHECK-NEXT:    vmov.f32 s3, s7
969; CHECK-NEXT:    vmov.f32 s6, s8
970; CHECK-NEXT:    vmov.f32 s7, s9
971; CHECK-NEXT:    vstrb.8 q1, [r1], #32
972; CHECK-NEXT:    vstrw.32 q3, [r1]
973; CHECK-NEXT:    vstrw.32 q0, [r1, #-16]
974; CHECK-NEXT:    bx lr
975entry:
976  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
977  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
978  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
979  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
980  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
981  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
982  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 2 lanes are undef.
983  %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
984  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
985  store <6 x i64> %s, <6 x i64> *%dst
986  ret void
987}
988
; vst3_v4i64: loads three <4 x i64> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <12 x i64> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the sequence saves d8-d15 and uses
; a 16-byte stack slot to spill and reload one q register.
989define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) {
990; CHECK-LABEL: vst3_v4i64:
991; CHECK:       @ %bb.0: @ %entry
992; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
993; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
994; CHECK-NEXT:    .pad #16
995; CHECK-NEXT:    sub sp, #16
996; CHECK-NEXT:    vldrw.u32 q1, [r0]
997; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
998; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
999; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
1000; CHECK-NEXT:    vmov.f64 d10, d2
1001; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
1002; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1003; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
1004; CHECK-NEXT:    vmov.f32 s21, s5
1005; CHECK-NEXT:    vmov.f32 s22, s28
1006; CHECK-NEXT:    vmov.f32 s23, s29
1007; CHECK-NEXT:    vmov.f64 d14, d12
1008; CHECK-NEXT:    vstrw.32 q5, [r1]
1009; CHECK-NEXT:    vmov.f32 s29, s25
1010; CHECK-NEXT:    vmov.f64 d8, d7
1011; CHECK-NEXT:    vmov.f32 s30, s12
1012; CHECK-NEXT:    vmov.f32 s17, s15
1013; CHECK-NEXT:    vmov.f32 s31, s13
1014; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
1015; CHECK-NEXT:    vmov.f32 s18, s2
1016; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
1017; CHECK-NEXT:    vmov.f32 s4, s8
1018; CHECK-NEXT:    vmov.f32 s19, s3
1019; CHECK-NEXT:    vmov.f32 s2, s26
1020; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
1021; CHECK-NEXT:    vmov.f32 s5, s9
1022; CHECK-NEXT:    vmov.f32 s8, s14
1023; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1024; CHECK-NEXT:    vmov.f32 s3, s27
1025; CHECK-NEXT:    vmov.f32 s9, s15
1026; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1027; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
1028; CHECK-NEXT:    add sp, #16
1029; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1030; CHECK-NEXT:    bx lr
1031entry:
1032  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
1033  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
1034  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
1035  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
1036  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
1037  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1038  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 4 lanes are undef.
1039  %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1040  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1041  store <12 x i64> %s, <12 x i64> *%dst
1042  ret void
1043}
1044
1045; f32
1046
; vst3_v2f32: loads three <2 x float> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <6 x float> in 3-way interleaved order (a0, b0, c0, a1, b1, c1).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, four elements are loaded straight
; into the lanes of q0 with scalar vldr and the other two go through core
; registers and are written with strd.
1047define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) {
1048; CHECK-LABEL: vst3_v2f32:
1049; CHECK:       @ %bb.0: @ %entry
1050; CHECK-NEXT:    vldr s0, [r0]
1051; CHECK-NEXT:    vldr s3, [r0, #4]
1052; CHECK-NEXT:    vldr s1, [r0, #8]
1053; CHECK-NEXT:    ldr r2, [r0, #20]
1054; CHECK-NEXT:    vldr s2, [r0, #16]
1055; CHECK-NEXT:    ldr r0, [r0, #12]
1056; CHECK-NEXT:    strd r0, r2, [r1, #16]
1057; CHECK-NEXT:    vstrw.32 q0, [r1]
1058; CHECK-NEXT:    bx lr
1059entry:
1060  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
1061  %l1 = load <2 x float>, <2 x float>* %s1, align 4
1062  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
1063  %l2 = load <2 x float>, <2 x float>* %s2, align 4
1064  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
1065  %l3 = load <2 x float>, <2 x float>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1066  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 2 lanes are undef.
1067  %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1068  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1069  store <6 x float> %s, <6 x float> *%dst
1070  ret void
1071}
1072
; vst3_v4f32: loads three <4 x float> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <12 x float> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the interleave is done with
; per-lane vmov.f32/vmov.f64 copies between three vldrw.u32 loads and three
; vstrw.32 stores, saving d8-d9 around the sequence.
1073define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) {
1074; CHECK-LABEL: vst3_v4f32:
1075; CHECK:       @ %bb.0: @ %entry
1076; CHECK-NEXT:    .vsave {d8, d9}
1077; CHECK-NEXT:    vpush {d8, d9}
1078; CHECK-NEXT:    vldrw.u32 q3, [r0]
1079; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1080; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
1081; CHECK-NEXT:    vmov.f64 d8, d6
1082; CHECK-NEXT:    vmov.f32 s17, s0
1083; CHECK-NEXT:    vmov.f32 s8, s1
1084; CHECK-NEXT:    vmov.f32 s19, s13
1085; CHECK-NEXT:    vmov.f32 s9, s5
1086; CHECK-NEXT:    vmov.f32 s18, s4
1087; CHECK-NEXT:    vmov.f32 s4, s6
1088; CHECK-NEXT:    vstrw.32 q4, [r1]
1089; CHECK-NEXT:    vmov.f32 s11, s2
1090; CHECK-NEXT:    vmov.f32 s5, s15
1091; CHECK-NEXT:    vmov.f32 s10, s14
1092; CHECK-NEXT:    vmov.f32 s6, s3
1093; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
1094; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1095; CHECK-NEXT:    vpop {d8, d9}
1096; CHECK-NEXT:    bx lr
1097entry:
1098  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
1099  %l1 = load <4 x float>, <4 x float>* %s1, align 4
1100  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
1101  %l2 = load <4 x float>, <4 x float>* %s2, align 4
1102  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
1103  %l3 = load <4 x float>, <4 x float>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1104  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 4 lanes are undef.
1105  %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1106  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1107  store <12 x float> %s, <12 x float> *%dst
1108  ret void
1109}
1110
; vst3_v8f32: loads three <8 x float> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <24 x float> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the sequence saves d8-d15 and uses
; a 16-byte stack slot to spill and reload one q register.
1111define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) {
1112; CHECK-LABEL: vst3_v8f32:
1113; CHECK:       @ %bb.0: @ %entry
1114; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1115; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1116; CHECK-NEXT:    .pad #16
1117; CHECK-NEXT:    sub sp, #16
1118; CHECK-NEXT:    vldrw.u32 q4, [r0]
1119; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
1120; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1121; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1122; CHECK-NEXT:    vmov.f64 d10, d8
1123; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
1124; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
1125; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
1126; CHECK-NEXT:    vmov.f32 s21, s24
1127; CHECK-NEXT:    vmov.f64 d12, d4
1128; CHECK-NEXT:    vmov.f64 d6, d1
1129; CHECK-NEXT:    vmov.f32 s25, s28
1130; CHECK-NEXT:    vmov.f32 s13, s11
1131; CHECK-NEXT:    vmov.f32 s27, s9
1132; CHECK-NEXT:    vmov.f32 s15, s3
1133; CHECK-NEXT:    vmov.f32 s26, s0
1134; CHECK-NEXT:    vmov.f32 s0, s29
1135; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
1136; CHECK-NEXT:    vmov.f32 s3, s30
1137; CHECK-NEXT:    vmov.f32 s14, s31
1138; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
1139; CHECK-NEXT:    vmov.f32 s23, s17
1140; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
1141; CHECK-NEXT:    vmov.f32 s2, s10
1142; CHECK-NEXT:    vmov.f32 s8, s29
1143; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1144; CHECK-NEXT:    vmov.f32 s9, s5
1145; CHECK-NEXT:    vmov.f32 s22, s4
1146; CHECK-NEXT:    vmov.f32 s4, s6
1147; CHECK-NEXT:    vstrw.32 q5, [r1]
1148; CHECK-NEXT:    vmov.f32 s11, s30
1149; CHECK-NEXT:    vmov.f32 s5, s19
1150; CHECK-NEXT:    vmov.f32 s10, s18
1151; CHECK-NEXT:    vmov.f32 s6, s31
1152; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
1153; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1154; CHECK-NEXT:    add sp, #16
1155; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1156; CHECK-NEXT:    bx lr
1157entry:
1158  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
1159  %l1 = load <8 x float>, <8 x float>* %s1, align 4
1160  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
1161  %l2 = load <8 x float>, <8 x float>* %s2, align 4
1162  %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
1163  %l3 = load <8 x float>, <8 x float>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1164  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 8 lanes are undef.
1165  %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1166  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1167  store <24 x float> %s, <24 x float> *%dst
1168  ret void
1169}
1170
; vst3_v16f32: loads three <16 x float> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <48 x float> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the sequence saves d8-d15 and
; reserves 160 bytes of stack for repeated q-register spills and reloads.
1171define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) {
1172; CHECK-LABEL: vst3_v16f32:
1173; CHECK:       @ %bb.0: @ %entry
1174; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1175; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1176; CHECK-NEXT:    .pad #160
1177; CHECK-NEXT:    sub sp, #160
1178; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1179; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
1180; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
1181; CHECK-NEXT:    vldrw.u32 q6, [r0]
1182; CHECK-NEXT:    vmov.f32 s12, s1
1183; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
1184; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
1185; CHECK-NEXT:    vmov.f32 s13, s9
1186; CHECK-NEXT:    vmov.f32 s15, s2
1187; CHECK-NEXT:    vldrw.u32 q4, [r0, #160]
1188; CHECK-NEXT:    vstrw.32 q5, [sp, #112] @ 16-byte Spill
1189; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
1190; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
1191; CHECK-NEXT:    vmov.f32 s14, s26
1192; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
1193; CHECK-NEXT:    vldrw.u32 q4, [r0, #144]
1194; CHECK-NEXT:    vstrw.32 q5, [sp, #64] @ 16-byte Spill
1195; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
1196; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
1197; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
1198; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
1199; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
1200; CHECK-NEXT:    vmov.f64 d6, d5
1201; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
1202; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
1203; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
1204; CHECK-NEXT:    vmov.f32 s13, s27
1205; CHECK-NEXT:    vmov.f32 s15, s11
1206; CHECK-NEXT:    vmov.f32 s14, s3
1207; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
1208; CHECK-NEXT:    vmov.f64 d6, d3
1209; CHECK-NEXT:    vmov.f32 s13, s23
1210; CHECK-NEXT:    vmov.f32 s15, s7
1211; CHECK-NEXT:    vmov.f32 s14, s31
1212; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
1213; CHECK-NEXT:    vmov.f64 d6, d12
1214; CHECK-NEXT:    vmov.f32 s13, s0
1215; CHECK-NEXT:    vmov.f32 s15, s25
1216; CHECK-NEXT:    vmov.f32 s14, s8
1217; CHECK-NEXT:    vmov q2, q7
1218; CHECK-NEXT:    vmov.f64 d0, d10
1219; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
1220; CHECK-NEXT:    vmov.f32 s1, s8
1221; CHECK-NEXT:    vmov.f32 s3, s21
1222; CHECK-NEXT:    vmov.f32 s2, s4
1223; CHECK-NEXT:    vmov.f32 s4, s9
1224; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
1225; CHECK-NEXT:    vmov.f32 s7, s10
1226; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
1227; CHECK-NEXT:    vmov.f32 s6, s22
1228; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
1229; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
1230; CHECK-NEXT:    vmov.f64 d4, d1
1231; CHECK-NEXT:    vmov q3, q1
1232; CHECK-NEXT:    vmov.f32 s20, s5
1233; CHECK-NEXT:    vmov.f32 s21, s1
1234; CHECK-NEXT:    vmov.f32 s23, s6
1235; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
1236; CHECK-NEXT:    vmov.f64 d12, d9
1237; CHECK-NEXT:    vmov q7, q1
1238; CHECK-NEXT:    vmov.f32 s9, s7
1239; CHECK-NEXT:    vmov.f32 s22, s6
1240; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
1241; CHECK-NEXT:    vmov.f32 s11, s3
1242; CHECK-NEXT:    vmov q0, q7
1243; CHECK-NEXT:    vmov.f32 s25, s7
1244; CHECK-NEXT:    vstrw.32 q5, [r1, #112]
1245; CHECK-NEXT:    vmov.f32 s27, s19
1246; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
1247; CHECK-NEXT:    vmov.f32 s10, s15
1248; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
1249; CHECK-NEXT:    vmov.f32 s29, s16
1250; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
1251; CHECK-NEXT:    vmov.f32 s31, s1
1252; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
1253; CHECK-NEXT:    vmov.f32 s26, s15
1254; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
1255; CHECK-NEXT:    vmov.f32 s30, s0
1256; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
1257; CHECK-NEXT:    vmov.f64 d0, d2
1258; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
1259; CHECK-NEXT:    vmov.f32 s1, s12
1260; CHECK-NEXT:    vmov.f32 s3, s5
1261; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
1262; CHECK-NEXT:    vmov.f32 s2, s16
1263; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
1264; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
1265; CHECK-NEXT:    vmov.f32 s16, s13
1266; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
1267; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1268; CHECK-NEXT:    vmov.f32 s19, s14
1269; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
1270; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
1271; CHECK-NEXT:    vmov.f32 s18, s6
1272; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
1273; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
1274; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
1275; CHECK-NEXT:    vstrw.32 q0, [r1]
1276; CHECK-NEXT:    add sp, #160
1277; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1278; CHECK-NEXT:    bx lr
1279entry:
1280  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
1281  %l1 = load <16 x float>, <16 x float>* %s1, align 4
1282  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
1283  %l2 = load <16 x float>, <16 x float>* %s2, align 4
1284  %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
1285  %l3 = load <16 x float>, <16 x float>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1286  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 16 lanes are undef.
1287  %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1288  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1289  store <48 x float> %s, <48 x float> *%dst
1290  ret void
1291}
1292
1293; f16
1294
; vst3_v2f16: loads three <2 x half> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <6 x half> in 3-way interleaved order (a0, b0, c0, a1, b1, c1).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the half lanes are rearranged with
; vmovx.f16/vins.f16 and the result is written from three core registers with
; a single stm.
1295define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
1296; CHECK-LABEL: vst3_v2f16:
1297; CHECK:       @ %bb.0: @ %entry
1298; CHECK-NEXT:    vldmia r0, {s0, s1}
1299; CHECK-NEXT:    ldr r0, [r0, #8]
1300; CHECK-NEXT:    vmovx.f16 s8, s0
1301; CHECK-NEXT:    vins.f16 s0, s1
1302; CHECK-NEXT:    vmov.32 q1[0], r0
1303; CHECK-NEXT:    vmovx.f16 s2, s1
1304; CHECK-NEXT:    vmovx.f16 s10, s4
1305; CHECK-NEXT:    vins.f16 s4, s8
1306; CHECK-NEXT:    vins.f16 s2, s10
1307; CHECK-NEXT:    vmov.f32 s1, s4
1308; CHECK-NEXT:    vmov r3, s2
1309; CHECK-NEXT:    vmov r0, r2, d0
1310; CHECK-NEXT:    stm r1!, {r0, r2, r3}
1311; CHECK-NEXT:    bx lr
1312entry:
1313  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
1314  %l1 = load <2 x half>, <2 x half>* %s1, align 4
1315  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
1316  %l2 = load <2 x half>, <2 x half>* %s2, align 4
1317  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
1318  %l3 = load <2 x half>, <2 x half>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1319  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 2 lanes are undef.
1320  %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1321  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1322  store <6 x half> %s, <6 x half> *%dst
1323  ret void
1324}
1325
; vst3_v4f16: loads three <4 x half> vectors from consecutive slots of %src
; (getelementptr indices 0, 1 and 2) and stores them to %dst as a single
; <12 x half> in 3-way interleaved order (a0, b0, c0, a1, b1, c1, ...).
; The expected output below is autogenerated (see the NOTE on the first line of
; this file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand. As currently pinned, the inputs are loaded into core
; registers with ldrd, moved into vector lanes, rearranged with
; vmovx.f16/vins.f16, and written with one vstrw.32 plus one strd.
1326define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
1327; CHECK-LABEL: vst3_v4f16:
1328; CHECK:       @ %bb.0: @ %entry
1329; CHECK-NEXT:    .save {r7, lr}
1330; CHECK-NEXT:    push {r7, lr}
1331; CHECK-NEXT:    .vsave {d8, d9}
1332; CHECK-NEXT:    vpush {d8, d9}
1333; CHECK-NEXT:    ldrd r2, r12, [r0]
1334; CHECK-NEXT:    ldrd r3, lr, [r0, #8]
1335; CHECK-NEXT:    vmov.32 q0[0], r2
1336; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
1337; CHECK-NEXT:    vmov.32 q1[0], r3
1338; CHECK-NEXT:    vmov.32 q0[1], r12
1339; CHECK-NEXT:    vmov.32 q1[1], lr
1340; CHECK-NEXT:    vmov.f32 s2, s4
1341; CHECK-NEXT:    vmov.f32 s3, s5
1342; CHECK-NEXT:    vmovx.f16 s10, s0
1343; CHECK-NEXT:    vmov.f32 s8, s1
1344; CHECK-NEXT:    vins.f16 s0, s2
1345; CHECK-NEXT:    vins.f16 s8, s5
1346; CHECK-NEXT:    vmov.32 q1[0], r2
1347; CHECK-NEXT:    vmov.32 q1[1], r0
1348; CHECK-NEXT:    vmovx.f16 s2, s2
1349; CHECK-NEXT:    vmovx.f16 s12, s4
1350; CHECK-NEXT:    vins.f16 s4, s10
1351; CHECK-NEXT:    vins.f16 s2, s12
1352; CHECK-NEXT:    vmovx.f16 s10, s1
1353; CHECK-NEXT:    vmovx.f16 s12, s5
1354; CHECK-NEXT:    vmovx.f16 s17, s3
1355; CHECK-NEXT:    vins.f16 s5, s10
1356; CHECK-NEXT:    vins.f16 s17, s12
1357; CHECK-NEXT:    vmov.f32 s16, s5
1358; CHECK-NEXT:    vmov.f32 s1, s4
1359; CHECK-NEXT:    vmov.f32 s3, s8
1360; CHECK-NEXT:    vstrw.32 q0, [r1]
1361; CHECK-NEXT:    vmov r0, r2, d8
1362; CHECK-NEXT:    strd r0, r2, [r1, #16]
1363; CHECK-NEXT:    vpop {d8, d9}
1364; CHECK-NEXT:    pop {r7, pc}
1365entry:
1366  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
1367  %l1 = load <4 x half>, <4 x half>* %s1, align 4
1368  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
1369  %l2 = load <4 x half>, <4 x half>* %s2, align 4
1370  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
1371  %l3 = load <4 x half>, <4 x half>* %s3, align 4
  ; %t1 is %l1 followed by %l2 (identity mask over both operands).
1372  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %t2 is %l3 widened to the lane count of %t1; its upper 4 lanes are undef.
1373  %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lane 3*i+k is element i of the k-th loaded vector.
1374  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1375  store <12 x half> %s, <12 x half> *%dst
1376  ret void
1377}
1378
; vst3_v8f16: loads three <8 x half> vectors from consecutive slots of %src
; (indices 0, 1 and 2), interleaves them element by element into a single
; <24 x half> value and stores that to %dst.
; The CHECK lines are autogenerated (see the NOTE on the first line of the file).
1379define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
1380; CHECK-LABEL: vst3_v8f16:
1381; CHECK:       @ %bb.0: @ %entry
1382; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
1383; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
1384; CHECK-NEXT:    vldrw.u32 q2, [r0]
1385; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
1386; CHECK-NEXT:    vmov.f64 d0, d4
1387; CHECK-NEXT:    vmovx.f16 s6, s20
1388; CHECK-NEXT:    vmovx.f16 s12, s8
1389; CHECK-NEXT:    vmov.f32 s4, s9
1390; CHECK-NEXT:    vins.f16 s0, s20
1391; CHECK-NEXT:    vmov r2, s6
1392; CHECK-NEXT:    vins.f16 s4, s21
1393; CHECK-NEXT:    vmov.16 q0[4], r2
1394; CHECK-NEXT:    vmov.f32 s3, s4
1395; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
1396; CHECK-NEXT:    vmov.f32 s1, s8
1397; CHECK-NEXT:    vmov.f32 s17, s4
1398; CHECK-NEXT:    vmovx.f16 s24, s7
1399; CHECK-NEXT:    vmov.f32 s18, s4
1400; CHECK-NEXT:    vins.f16 s17, s12
1401; CHECK-NEXT:    vmovx.f16 s12, s18
1402; CHECK-NEXT:    vins.f16 s2, s12
1403; CHECK-NEXT:    vmovx.f16 s15, s23
1404; CHECK-NEXT:    vins.f16 s15, s24
1405; CHECK-NEXT:    vmovx.f16 s24, s6
1406; CHECK-NEXT:    vmovx.f16 s12, s22
1407; CHECK-NEXT:    vmov.f32 s18, s2
1408; CHECK-NEXT:    vins.f16 s12, s24
1409; CHECK-NEXT:    vmov.f32 s25, s11
1410; CHECK-NEXT:    vins.f16 s13, s23
1411; CHECK-NEXT:    vmov.f32 s26, s11
1412; CHECK-NEXT:    vmov.f32 s14, s7
1413; CHECK-NEXT:    vmovx.f16 s28, s13
1414; CHECK-NEXT:    vins.f16 s25, s28
1415; CHECK-NEXT:    vmovx.f16 s28, s26
1416; CHECK-NEXT:    vins.f16 s14, s28
1417; CHECK-NEXT:    vmovx.f16 s28, s9
1418; CHECK-NEXT:    vmov.f32 s4, s5
1419; CHECK-NEXT:    vrev32.16 q5, q5
1420; CHECK-NEXT:    vins.f16 s4, s28
1421; CHECK-NEXT:    vmovx.f16 s28, s10
1422; CHECK-NEXT:    vins.f16 s6, s28
1423; CHECK-NEXT:    vmov.f32 s26, s14
1424; CHECK-NEXT:    vmov.f32 s7, s6
1425; CHECK-NEXT:    vmov.f32 s6, s10
1426; CHECK-NEXT:    vmovx.f16 s8, s5
1427; CHECK-NEXT:    vins.f16 s21, s8
1428; CHECK-NEXT:    vmovx.f16 s8, s22
1429; CHECK-NEXT:    vins.f16 s6, s8
1430; CHECK-NEXT:    vmov.f32 s1, s17
1431; CHECK-NEXT:    vmov.f32 s22, s6
1432; CHECK-NEXT:    vmov.f32 s13, s25
1433; CHECK-NEXT:    vmov.f32 s5, s21
1434; CHECK-NEXT:    vmov.f32 s2, s18
1435; CHECK-NEXT:    vmov.f32 s14, s26
1436; CHECK-NEXT:    vstrw.32 q0, [r1]
1437; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
1438; CHECK-NEXT:    vmov.f32 s6, s22
1439; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1440; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
1441; CHECK-NEXT:    bx lr
1442entry:
  ; Load the three <8 x half> inputs from %src[0], %src[1] and %src[2].
1443  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
1444  %l1 = load <8 x half>, <8 x half>* %s1, align 4
1445  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
1446  %l2 = load <8 x half>, <8 x half>* %s2, align 4
1447  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
1448  %l3 = load <8 x half>, <8 x half>* %s3, align 4
  ; %t1 = %l1 followed by %l2 (identity mask over both operands, 16 lanes).
1449  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; %t2 = %l3 widened to 16 lanes; the upper 8 lanes are undef so that %t2 has
  ; the same type as %t1 for the final shuffle.
1450  %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: mask indices 0-7 pick %l1, 8-15 pick %l2 (both via %t1)
  ; and 16-23 pick %l3 (the defined lanes of %t2), giving
  ; l1[0], l2[0], l3[0], l1[1], l2[1], l3[1], ...
1451  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1452  store <24 x half> %s, <24 x half> *%dst
1453  ret void
1454}
1455
; vst3_v16f16: loads three <16 x half> vectors from consecutive slots of %src
; (indices 0, 1 and 2), interleaves them element by element into a single
; <48 x half> value and stores that to %dst.
; The CHECK lines are autogenerated (see the NOTE on the first line of the file).
1456define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
1457; CHECK-LABEL: vst3_v16f16:
1458; CHECK:       @ %bb.0: @ %entry
1459; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1460; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1461; CHECK-NEXT:    .pad #128
1462; CHECK-NEXT:    sub sp, #128
1463; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
1464; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
1465; CHECK-NEXT:    vldrw.u32 q6, [r0]
1466; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
1467; CHECK-NEXT:    vmovx.f16 s0, s19
1468; CHECK-NEXT:    vmovx.f16 s7, s15
1469; CHECK-NEXT:    vins.f16 s7, s0
1470; CHECK-NEXT:    vmovx.f16 s0, s18
1471; CHECK-NEXT:    vmovx.f16 s4, s14
1472; CHECK-NEXT:    vstrw.32 q5, [sp, #64] @ 16-byte Spill
1473; CHECK-NEXT:    vins.f16 s4, s0
1474; CHECK-NEXT:    vmov.f64 d14, d12
1475; CHECK-NEXT:    vins.f16 s5, s15
1476; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
1477; CHECK-NEXT:    vmov.f32 s6, s19
1478; CHECK-NEXT:    vmovx.f16 s0, s5
1479; CHECK-NEXT:    vmov q2, q1
1480; CHECK-NEXT:    vmov.f32 s5, s27
1481; CHECK-NEXT:    vmov.f32 s6, s27
1482; CHECK-NEXT:    vins.f16 s28, s12
1483; CHECK-NEXT:    vins.f16 s5, s0
1484; CHECK-NEXT:    vmovx.f16 s0, s6
1485; CHECK-NEXT:    vins.f16 s10, s0
1486; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
1487; CHECK-NEXT:    vmov.f64 d2, d10
1488; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
1489; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
1490; CHECK-NEXT:    vmovx.f16 s2, s8
1491; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
1492; CHECK-NEXT:    vmov.f32 s0, s21
1493; CHECK-NEXT:    vins.f16 s4, s8
1494; CHECK-NEXT:    vmov r2, s2
1495; CHECK-NEXT:    vins.f16 s0, s9
1496; CHECK-NEXT:    vmov.16 q1[4], r2
1497; CHECK-NEXT:    vmovx.f16 s2, s12
1498; CHECK-NEXT:    vmov.f32 s7, s0
1499; CHECK-NEXT:    vmovx.f16 s0, s20
1500; CHECK-NEXT:    vmov.f32 s5, s20
1501; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
1502; CHECK-NEXT:    vmov r0, s2
1503; CHECK-NEXT:    vmov.f32 s9, s20
1504; CHECK-NEXT:    vmov.16 q7[4], r0
1505; CHECK-NEXT:    vmov.f32 s10, s20
1506; CHECK-NEXT:    vins.f16 s9, s0
1507; CHECK-NEXT:    vmovx.f16 s0, s10
1508; CHECK-NEXT:    vins.f16 s6, s0
1509; CHECK-NEXT:    vmov.f32 s0, s25
1510; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
1511; CHECK-NEXT:    vmov q2, q4
1512; CHECK-NEXT:    vins.f16 s0, s13
1513; CHECK-NEXT:    vstrw.32 q1, [sp, #112] @ 16-byte Spill
1514; CHECK-NEXT:    vmov.f32 s5, s8
1515; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
1516; CHECK-NEXT:    vmov.f32 s31, s0
1517; CHECK-NEXT:    vmovx.f16 s0, s24
1518; CHECK-NEXT:    vmov.f32 s6, s8
1519; CHECK-NEXT:    vins.f16 s5, s0
1520; CHECK-NEXT:    vmov.f32 s29, s24
1521; CHECK-NEXT:    vmovx.f16 s0, s6
1522; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
1523; CHECK-NEXT:    vins.f16 s30, s0
1524; CHECK-NEXT:    vmovx.f16 s0, s22
1525; CHECK-NEXT:    vmovx.f16 s4, s14
1526; CHECK-NEXT:    vmov.f32 s8, s9
1527; CHECK-NEXT:    vins.f16 s4, s0
1528; CHECK-NEXT:    vmovx.f16 s0, s23
1529; CHECK-NEXT:    vmovx.f16 s7, s15
1530; CHECK-NEXT:    vins.f16 s7, s0
1531; CHECK-NEXT:    vins.f16 s5, s15
1532; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
1533; CHECK-NEXT:    vmov.f32 s6, s23
1534; CHECK-NEXT:    vmovx.f16 s16, s5
1535; CHECK-NEXT:    vmov.f32 s1, s15
1536; CHECK-NEXT:    vmov.f32 s2, s15
1537; CHECK-NEXT:    vins.f16 s1, s16
1538; CHECK-NEXT:    vmovx.f16 s16, s2
1539; CHECK-NEXT:    vins.f16 s6, s16
1540; CHECK-NEXT:    vmovx.f16 s16, s13
1541; CHECK-NEXT:    vmov.f32 s20, s21
1542; CHECK-NEXT:    vins.f16 s20, s16
1543; CHECK-NEXT:    vmovx.f16 s16, s14
1544; CHECK-NEXT:    vins.f16 s22, s16
1545; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
1546; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
1547; CHECK-NEXT:    vmov.f32 s23, s22
1548; CHECK-NEXT:    vmov.f32 s14, s18
1549; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
1550; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
1551; CHECK-NEXT:    vmov.f32 s14, s30
1552; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
1553; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
1554; CHECK-NEXT:    vmov.f32 s2, s6
1555; CHECK-NEXT:    vmov.f32 s22, s14
1556; CHECK-NEXT:    vmovx.f16 s12, s21
1557; CHECK-NEXT:    vstr s12, [sp, #64] @ 4-byte Spill
1558; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
1559; CHECK-NEXT:    vmov.f32 s5, s1
1560; CHECK-NEXT:    vrev32.16 q4, q3
1561; CHECK-NEXT:    vldr s12, [sp, #64] @ 4-byte Reload
1562; CHECK-NEXT:    vins.f16 s17, s12
1563; CHECK-NEXT:    vmovx.f16 s12, s18
1564; CHECK-NEXT:    vins.f16 s22, s12
1565; CHECK-NEXT:    vmovx.f16 s12, s25
1566; CHECK-NEXT:    vmov.f32 s6, s2
1567; CHECK-NEXT:    vins.f16 s8, s12
1568; CHECK-NEXT:    vmovx.f16 s0, s26
1569; CHECK-NEXT:    vmov.f32 s18, s22
1570; CHECK-NEXT:    vins.f16 s10, s0
1571; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
1572; CHECK-NEXT:    vmov.f32 s11, s10
1573; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
1574; CHECK-NEXT:    vmov.f32 s10, s26
1575; CHECK-NEXT:    vrev32.16 q6, q0
1576; CHECK-NEXT:    vmovx.f16 s12, s9
1577; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
1578; CHECK-NEXT:    vins.f16 s25, s12
1579; CHECK-NEXT:    vmovx.f16 s12, s26
1580; CHECK-NEXT:    vins.f16 s10, s12
1581; CHECK-NEXT:    vmov.f32 s29, s1
1582; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
1583; CHECK-NEXT:    vmov.f32 s30, s2
1584; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
1585; CHECK-NEXT:    vmov.f32 s26, s10
1586; CHECK-NEXT:    vmov.f32 s1, s13
1587; CHECK-NEXT:    vstrw.32 q7, [r1]
1588; CHECK-NEXT:    vmov.f32 s2, s14
1589; CHECK-NEXT:    vldrw.u32 q3, [sp, #16] @ 16-byte Reload
1590; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
1591; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1592; CHECK-NEXT:    vmov.f32 s2, s14
1593; CHECK-NEXT:    vmov.f32 s13, s1
1594; CHECK-NEXT:    vmov.f32 s21, s17
1595; CHECK-NEXT:    vmov.f32 s9, s25
1596; CHECK-NEXT:    vmov.f32 s22, s18
1597; CHECK-NEXT:    vmov.f32 s10, s26
1598; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
1599; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
1600; CHECK-NEXT:    vmov.f32 s14, s2
1601; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
1602; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
1603; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
1604; CHECK-NEXT:    add sp, #128
1605; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1606; CHECK-NEXT:    bx lr
1607entry:
  ; Load the three <16 x half> inputs from %src[0], %src[1] and %src[2].
1608  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
1609  %l1 = load <16 x half>, <16 x half>* %s1, align 4
1610  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
1611  %l2 = load <16 x half>, <16 x half>* %s2, align 4
1612  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
1613  %l3 = load <16 x half>, <16 x half>* %s3, align 4
  ; %t1 = %l1 followed by %l2 (identity mask over both operands, 32 lanes).
1614  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; %t2 = %l3 widened to 32 lanes; the upper 16 lanes are undef so that %t2 has
  ; the same type as %t1 for the final shuffle.
1615  %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: mask indices 0-15 pick %l1, 16-31 pick %l2 (both via %t1)
  ; and 32-47 pick %l3 (the defined lanes of %t2), giving
  ; l1[0], l2[0], l3[0], l1[1], l2[1], l3[1], ...
1616  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1617  store <48 x half> %s, <48 x half> *%dst
1618  ret void
1619}
1620
1621; f64
1622
; vst3_v2f64: loads three <2 x double> vectors from consecutive slots of %src
; (indices 0, 1 and 2), interleaves them element by element into a single
; <6 x double> value and stores that to %dst.
; The CHECK lines are autogenerated (see the NOTE on the first line of the file).
1623define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) {
1624; CHECK-LABEL: vst3_v2f64:
1625; CHECK:       @ %bb.0: @ %entry
1626; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
1627; CHECK-NEXT:    vldrw.u32 q0, [r0]
1628; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1629; CHECK-NEXT:    vmov.f64 d6, d2
1630; CHECK-NEXT:    vmov.f64 d7, d1
1631; CHECK-NEXT:    vmov.f64 d1, d4
1632; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
1633; CHECK-NEXT:    vmov.f64 d2, d5
1634; CHECK-NEXT:    vstrw.32 q0, [r1]
1635; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
1636; CHECK-NEXT:    bx lr
1637entry:
  ; Load the three <2 x double> inputs from %src[0], %src[1] and %src[2].
1638  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
1639  %l1 = load <2 x double>, <2 x double>* %s1, align 4
1640  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
1641  %l2 = load <2 x double>, <2 x double>* %s2, align 4
1642  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
1643  %l3 = load <2 x double>, <2 x double>* %s3, align 4
  ; %t1 = %l1 followed by %l2 (identity mask over both operands, 4 lanes).
1644  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; %t2 = %l3 widened to 4 lanes; the upper 2 lanes are undef so that %t2 has
  ; the same type as %t1 for the final shuffle.
1645  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; 3-way interleave: mask indices 0-1 pick %l1, 2-3 pick %l2 (both via %t1)
  ; and 4-5 pick %l3 (the defined lanes of %t2), giving
  ; l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
1646  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1647  store <6 x double> %s, <6 x double> *%dst
1648  ret void
1649}
1650
; vst3_v4f64: loads three <4 x double> vectors from consecutive slots of %src
; (indices 0, 1 and 2), interleaves them element by element into a single
; <12 x double> value and stores that to %dst.
; The CHECK lines are autogenerated (see the NOTE on the first line of the file).
1651define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
1652; CHECK-LABEL: vst3_v4f64:
1653; CHECK:       @ %bb.0: @ %entry
1654; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1655; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1656; CHECK-NEXT:    .pad #16
1657; CHECK-NEXT:    sub sp, #16
1658; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
1659; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
1660; CHECK-NEXT:    vldrw.u32 q1, [r0]
1661; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
1662; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1663; CHECK-NEXT:    vmov.f64 d6, d15
1664; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
1665; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
1666; CHECK-NEXT:    vmov.f64 d10, d2
1667; CHECK-NEXT:    vmov.f64 d7, d1
1668; CHECK-NEXT:    vmov.f64 d11, d12
1669; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
1670; CHECK-NEXT:    vmov.f64 d12, d4
1671; CHECK-NEXT:    vstrw.32 q5, [r1]
1672; CHECK-NEXT:    vmov.f64 d1, d5
1673; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
1674; CHECK-NEXT:    vmov.f64 d2, d8
1675; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
1676; CHECK-NEXT:    vmov.f64 d13, d14
1677; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1678; CHECK-NEXT:    vmov.f64 d8, d5
1679; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
1680; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
1681; CHECK-NEXT:    add sp, #16
1682; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1683; CHECK-NEXT:    bx lr
1684entry:
  ; Load the three <4 x double> inputs from %src[0], %src[1] and %src[2].
1685  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
1686  %l1 = load <4 x double>, <4 x double>* %s1, align 4
1687  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
1688  %l2 = load <4 x double>, <4 x double>* %s2, align 4
1689  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
1690  %l3 = load <4 x double>, <4 x double>* %s3, align 4
  ; %t1 = %l1 followed by %l2 (identity mask over both operands, 8 lanes).
1691  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %t2 = %l3 widened to 8 lanes; the upper 4 lanes are undef so that %t2 has
  ; the same type as %t1 for the final shuffle.
1692  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: mask indices 0-3 pick %l1, 4-7 pick %l2 (both via %t1)
  ; and 8-11 pick %l3 (the defined lanes of %t2), giving
  ; l1[0], l2[0], l3[0], l1[1], l2[1], l3[1], ...
1693  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1694  store <12 x double> %s, <12 x double> *%dst
1695  ret void
1696}
1697