; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Codegen tests for three-way interleaved vector stores (see the run line above
; for the exact target triple and feature set).
;
; Every function below has the same shape:
;   1. load three equally sized vectors %l1, %l2, %l3 from consecutive vector
;      slots of %src (getelementptr indices 0, 1 and 2), each with align 4;
;   2. concatenate %l1 and %l2 into %t1, and widen %l3 with undef lanes into %t2;
;   3. shuffle %t1/%t2 so the lanes come out as
;      l1[0], l2[0], l3[0], l1[1], l2[1], l3[1], ...;
;   4. store the interleaved vector through %dst.
;
; The assertion comments inside each function are machine generated (see the
; NOTE on the first line); regenerate them with that script instead of editing
; them by hand.

; i32

; Interleaves three <2 x i32> loads into a single <6 x i32> store.
define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
; CHECK-LABEL: vst3_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrd lr, r12, [r0]
; CHECK-NEXT: ldrd r3, r2, [r0, #8]
; CHECK-NEXT: ldrd r4, r0, [r0, #16]
; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
; CHECK-NEXT: vmov.32 q0[0], r4
; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov.f32 s8, s7
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov.f64 d4, d2
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vmov.f32 s10, s0
; CHECK-NEXT: vmov.f32 s11, s5
; CHECK-NEXT: vstrw.32 q2, [r1]
; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i32> %s, <6 x i32> *%dst
  ret void
}

; Interleaves three <4 x i32> loads into a single <12 x i32> store.
define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) {
; CHECK-LABEL: vst3_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vmov.f64 d8, d6
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s8, s5
; CHECK-NEXT: vmov.f32 s19, s13
; CHECK-NEXT: vmov.f32 s9, s1
; CHECK-NEXT: vmov.f32 s18, s0
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vstrw.32 q4, [r1]
; CHECK-NEXT: vmov.f32 s11, s6
; CHECK-NEXT: vmov.f32 s1, s15
; CHECK-NEXT: vmov.f32 s10, s14
; CHECK-NEXT: vmov.f32 s2, s7
; CHECK-NEXT: vstrw.32 q2, [r1, #16]
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %s, <12 x i32> *%dst
  ret void
}

; Interleaves three <8 x i32> loads into a single <24 x i32> store.
; The expected output includes one 16-byte stack spill/reload.
define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) {
; CHECK-LABEL: vst3_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrw.u32 q4, [r0]
; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vmov.f64 d10, d8
; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
; CHECK-NEXT: vmov.f32 s21, s28
; CHECK-NEXT: vmov.f64 d14, d12
; CHECK-NEXT: vmov.f64 d4, d1
; CHECK-NEXT: vmov.f32 s29, s12
; CHECK-NEXT: vmov.f32 s9, s27
; CHECK-NEXT: vmov.f32 s31, s25
; CHECK-NEXT: vmov.f32 s11, s3
; CHECK-NEXT: vmov.f32 s30, s0
; CHECK-NEXT: vmov.f32 s0, s13
; CHECK-NEXT: vstrw.32 q7, [r1, #48]
; CHECK-NEXT: vmov.f32 s3, s14
; CHECK-NEXT: vmov.f32 s2, s26
; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s10, s15
; CHECK-NEXT: vstrw.32 q0, [r1, #64]
; CHECK-NEXT: vmov.f32 s23, s17
; CHECK-NEXT: vstrw.32 q2, [r1, #80]
; CHECK-NEXT: vmov.f32 s12, s25
; CHECK-NEXT: vmov.f32 s13, s5
; CHECK-NEXT: vmov.f32 s22, s4
; CHECK-NEXT: vmov.f32 s4, s6
; CHECK-NEXT: vstrw.32 q5, [r1]
; CHECK-NEXT: vmov.f32 s15, s26
; CHECK-NEXT: vmov.f32 s5, s19
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmov.f32 s6, s27
; CHECK-NEXT: vstrw.32 q3, [r1, #16]
; CHECK-NEXT: vstrw.32 q1, [r1, #32]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i32> %s, <24 x i32> *%dst
  ret void
}

; Interleaves three <16 x i32> loads into a single <48 x i32> store.
; The expected output reserves 160 bytes of stack and contains several
; 16-byte spills/reloads.
define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
; CHECK-LABEL: vst3_v16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #160
; CHECK-NEXT: sub sp, #160
; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
; CHECK-NEXT: vldrw.u32 q6, [r0]
; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
; CHECK-NEXT: vmov.f32 s16, s1
; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
; CHECK-NEXT: vmov.f32 s17, s9
; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s19, s2
; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
; CHECK-NEXT: vmov.f32 s18, s26
; CHECK-NEXT: vldrw.u32 q5, [r0, #144]
; CHECK-NEXT: vldrw.u32 q1, [r0, #176]
; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
; CHECK-NEXT: vstrw.32 q4, [r1, #16]
; CHECK-NEXT: vmov.f64 d8, d5
; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s17, s27
; CHECK-NEXT: vmov.f32 s19, s11
; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
; CHECK-NEXT: vmov.f64 d8, d3
; CHECK-NEXT: vmov.f32 s17, s31
; CHECK-NEXT: vmov.f32 s19, s7
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d8, d12
; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s19, s25
; CHECK-NEXT: vmov.f32 s18, s8
; CHECK-NEXT: vmov q2, q7
; CHECK-NEXT: vmov.f64 d0, d4
; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s1, s12
; CHECK-NEXT: vmov.f32 s3, s9
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s4, s13
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s7, s14
; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s6, s10
; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d4, d1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s17, s1
; CHECK-NEXT: vmov.f32 s19, s6
; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d12, d11
; CHECK-NEXT: vmov q7, q1
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s18, s6
; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s11, s3
; CHECK-NEXT: vmov q0, q7
; CHECK-NEXT: vmov.f32 s25, s7
; CHECK-NEXT: vstrw.32 q4, [r1, #112]
; CHECK-NEXT: vmov.f32 s27, s23
; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s10, s15
; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s29, s20
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov.f32 s31, s1
; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s26, s15
; CHECK-NEXT: vstrw.32 q2, [r1, #128]
; CHECK-NEXT: vmov.f32 s30, s0
; CHECK-NEXT: vstrw.32 q6, [r1, #80]
; CHECK-NEXT: vmov.f64 d0, d2
; CHECK-NEXT: vstrw.32 q7, [r1, #96]
; CHECK-NEXT: vmov.f32 s1, s12
; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s2, s12
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vstrw.32 q0, [r1, #144]
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s15, s6
; CHECK-NEXT: vstrw.32 q0, [r1, #160]
; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s14, s22
; CHECK-NEXT: vstrw.32 q0, [r1, #176]
; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q3, [r1, #64]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: add sp, #160
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x i32> %s, <48 x i32> *%dst
  ret void
}

; i16

; Interleaves three <2 x i16> loads into a single <6 x i16> store.
define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
; CHECK-LABEL: vst3_v2i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrh r2, [r0, #6]
; CHECK-NEXT: ldrh.w lr, [r0, #4]
; CHECK-NEXT: ldrh.w r12, [r0, #8]
; CHECK-NEXT: vmov.16 q0[4], r2
; CHECK-NEXT: ldrh r3, [r0, #2]
; CHECK-NEXT: vmov q1[2], q1[0], lr, r2
; CHECK-NEXT: ldrh r4, [r0]
; CHECK-NEXT: ldrh r0, [r0, #10]
; CHECK-NEXT: vmov.16 q0[5], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
; CHECK-NEXT: vmov.f32 s1, s4
; CHECK-NEXT: vmov.f32 s3, s2
; CHECK-NEXT: vmov.32 q0[2], r12
; CHECK-NEXT: vstrh.32 q0, [r1]
; CHECK-NEXT: str r0, [r1, #8]
; CHECK-NEXT: pop {r4, pc}
entry:
  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i16> %s, <6 x i16> *%dst
  ret void
}

; Interleaves three <4 x i16> loads into a single <12 x i16> store.
define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
; CHECK-LABEL: vst3_v4i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrh.u32 q2, [r0, #16]
; CHECK-NEXT: vldrh.u32 q1, [r0]
; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
; CHECK-NEXT: vmov.f64 d6, d5
; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov r0, r5, d2
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov lr, r4, d1
; CHECK-NEXT: vmov.16 q0[0], r0
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.16 q0[1], r2
; CHECK-NEXT: vmov.32 q3[2], r4
; CHECK-NEXT: vmov r0, r4, d4
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: vmov r12, s6
; CHECK-NEXT: vmov.16 q0[3], r5
; CHECK-NEXT: vstrh.32 q3, [r1, #16]
; CHECK-NEXT: vmov.16 q0[4], r3
; CHECK-NEXT: vmov.16 q0[5], r4
; CHECK-NEXT: vmov.16 q0[6], r12
; CHECK-NEXT: vmov.16 q0[7], lr
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i16> %s, <12 x i16> *%dst
  ret void
}

; Interleaves three <8 x i16> loads into a single <24 x i16> store.
define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
; CHECK-LABEL: vst3_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f64 d0, d4
; CHECK-NEXT: vmov.u16 r2, q1[1]
; CHECK-NEXT: vmovx.f16 s20, s8
; CHECK-NEXT: vins.f16 s0, s4
; CHECK-NEXT: vmov.f32 s12, s9
; CHECK-NEXT: vins.f16 s12, s5
; CHECK-NEXT: vmov.16 q0[4], r2
; CHECK-NEXT: vmov.f32 s3, s12
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s1, s8
; CHECK-NEXT: vmov.f32 s17, s12
; CHECK-NEXT: vmov.f32 s18, s12
; CHECK-NEXT: vins.f16 s17, s20
; CHECK-NEXT: vmovx.f16 s20, s18
; CHECK-NEXT: vins.f16 s2, s20
; CHECK-NEXT: vmovx.f16 s20, s14
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s1, s17
; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vmovx.f16 s16, s6
; CHECK-NEXT: vins.f16 s16, s20
; CHECK-NEXT: vmovx.f16 s20, s15
; CHECK-NEXT: vins.f16 s17, s7
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vmovx.f16 s19, s7
; CHECK-NEXT: vrev32.16 q1, q1
; CHECK-NEXT: vins.f16 s19, s20
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vmovx.f16 s24, s17
; CHECK-NEXT: vmov.f32 s22, s11
; CHECK-NEXT: vins.f16 s21, s24
; CHECK-NEXT: vmovx.f16 s24, s22
; CHECK-NEXT: vins.f16 s18, s24
; CHECK-NEXT: vmov.f32 s12, s13
; CHECK-NEXT: vmov.f32 s22, s18
; CHECK-NEXT: vmov.f32 s17, s21
; CHECK-NEXT: vmov.f32 s18, s22
; CHECK-NEXT: vmovx.f16 s20, s9
; CHECK-NEXT: vins.f16 s12, s20
; CHECK-NEXT: vmovx.f16 s20, s10
; CHECK-NEXT: vins.f16 s14, s20
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
; CHECK-NEXT: vmov.f32 s15, s14
; CHECK-NEXT: vmov.f32 s14, s10
; CHECK-NEXT: vmovx.f16 s8, s13
; CHECK-NEXT: vins.f16 s5, s8
; CHECK-NEXT: vmovx.f16 s8, s6
; CHECK-NEXT: vins.f16 s14, s8
; CHECK-NEXT: vmov.f32 s6, s14
; CHECK-NEXT: vmov.f32 s13, s5
; CHECK-NEXT: vmov.f32 s14, s6
; CHECK-NEXT: vstrw.32 q3, [r1, #16]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i16> %s, <24 x i16> *%dst
  ret void
}

; Interleaves three <16 x i16> loads into a single <48 x i16> store.
; The expected output reserves 80 bytes of stack and contains spills/reloads.
define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
; CHECK-LABEL: vst3_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #80
; CHECK-NEXT: sub sp, #80
; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
; CHECK-NEXT: vmovx.f16 s0, s14
; CHECK-NEXT: vmovx.f16 s8, s22
; CHECK-NEXT: vins.f16 s8, s0
; CHECK-NEXT: vmovx.f16 s0, s15
; CHECK-NEXT: vins.f16 s9, s23
; CHECK-NEXT: vmov.u16 r2, q6[1]
; CHECK-NEXT: vmovx.f16 s11, s23
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vins.f16 s11, s0
; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s10, s15
; CHECK-NEXT: vmovx.f16 s4, s9
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vins.f16 s1, s4
; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vins.f16 s18, s4
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vmov.f64 d4, d2
; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmovx.f16 s28, s4
; CHECK-NEXT: vins.f16 s8, s24
; CHECK-NEXT: vmov.f32 s17, s1
; CHECK-NEXT: vmov.16 q2[4], r2
; CHECK-NEXT: vmov.f32 s11, s5
; CHECK-NEXT: vins.f16 s11, s25
; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
; CHECK-NEXT: vmov.f32 s9, s4
; CHECK-NEXT: vmov.u16 r0, q5[1]
; CHECK-NEXT: vmov.f32 s5, s24
; CHECK-NEXT: vmov.f32 s6, s24
; CHECK-NEXT: vins.f16 s5, s28
; CHECK-NEXT: vmovx.f16 s28, s6
; CHECK-NEXT: vins.f16 s10, s28
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s6, s10
; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s9, s5
; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s10, s6
; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q2, [r1]
; CHECK-NEXT: vmov.f64 d14, d2
; CHECK-NEXT: vins.f16 s28, s20
; CHECK-NEXT: vmov.f32 s0, s5
; CHECK-NEXT: vins.f16 s0, s21
; CHECK-NEXT: vmov.16 q7[4], r0
; CHECK-NEXT: vmov.f32 s31, s0
; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s1, s12
; CHECK-NEXT: vmov.f32 s29, s4
; CHECK-NEXT: vmovx.f16 s4, s4
; CHECK-NEXT: vmov.f32 s2, s12
; CHECK-NEXT: vins.f16 s1, s4
; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vins.f16 s30, s4
; CHECK-NEXT: vmovx.f16 s4, s26
; CHECK-NEXT: vmov.f32 s2, s30
; CHECK-NEXT: vmov.f32 s29, s1
; CHECK-NEXT: vmov.f32 s12, s13
; CHECK-NEXT: vmov.f32 s30, s2
; CHECK-NEXT: vmovx.f16 s0, s18
; CHECK-NEXT: vins.f16 s0, s4
; CHECK-NEXT: vmov q1, q4
; CHECK-NEXT: vins.f16 s1, s7
; CHECK-NEXT: vstrw.32 q7, [r1, #48]
; CHECK-NEXT: vmovx.f16 s3, s7
; CHECK-NEXT: vmovx.f16 s4, s27
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: vmov.f32 s5, s23
; CHECK-NEXT: vmov.f32 s2, s27
; CHECK-NEXT: vmovx.f16 s16, s1
; CHECK-NEXT: vmov.f32 s6, s23
; CHECK-NEXT: vins.f16 s5, s16
; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmovx.f16 s20, s6
; CHECK-NEXT: vmov.f32 s24, s25
; CHECK-NEXT: vins.f16 s2, s20
; CHECK-NEXT: vmovx.f16 s20, s17
; CHECK-NEXT: vins.f16 s12, s20
; CHECK-NEXT: vmovx.f16 s20, s18
; CHECK-NEXT: vins.f16 s14, s20
; CHECK-NEXT: vmov.f32 s6, s2
; CHECK-NEXT: vmov.f32 s15, s14
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmovx.f16 s16, s13
; CHECK-NEXT: vstr s16, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s1, s5
; CHECK-NEXT: vrev32.16 q5, q4
; CHECK-NEXT: vldr s16, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: vins.f16 s21, s16
; CHECK-NEXT: vmovx.f16 s16, s22
; CHECK-NEXT: vins.f16 s14, s16
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s2, s6
; CHECK-NEXT: vmovx.f16 s4, s17
; CHECK-NEXT: vmov.f32 s22, s14
; CHECK-NEXT: vins.f16 s24, s4
; CHECK-NEXT: vmovx.f16 s4, s18
; CHECK-NEXT: vins.f16 s26, s4
; CHECK-NEXT: vmov.f32 s13, s21
; CHECK-NEXT: vmov.f32 s27, s26
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
; CHECK-NEXT: vmov.f32 s26, s18
; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vmovx.f16 s4, s25
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vrev32.16 q4, q4
; CHECK-NEXT: vins.f16 s17, s4
; CHECK-NEXT: vmovx.f16 s4, s18
; CHECK-NEXT: vins.f16 s26, s4
; CHECK-NEXT: vmov.f32 s14, s22
; CHECK-NEXT: vmov.f32 s18, s26
; CHECK-NEXT: vstrw.32 q3, [r1, #64]
; CHECK-NEXT: vmov.f32 s25, s17
; CHECK-NEXT: vstrw.32 q0, [r1, #80]
; CHECK-NEXT: vmov.f32 s26, s18
; CHECK-NEXT: vstrw.32 q6, [r1, #16]
; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x i16> %s, <48 x i16> *%dst
  ret void
}

; i8

; Interleaves three <2 x i8> loads into a single <6 x i8> store.
define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
; CHECK-LABEL: vst3_v2i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: ldrb r3, [r0, #1]
; CHECK-NEXT: ldrb.w r12, [r0, #2]
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: ldrb.w lr, [r0, #3]
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: ldrb r5, [r0, #5]
; CHECK-NEXT: vmov.16 q0[0], r4
; CHECK-NEXT: ldrb r0, [r0, #4]
; CHECK-NEXT: vmov.16 q0[1], r12
; CHECK-NEXT: mov r2, sp
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: add r0, sp, #8
; CHECK-NEXT: vmov.16 q0[3], r3
; CHECK-NEXT: vmov.16 q0[4], lr
; CHECK-NEXT: vmov.16 q0[5], r5
; CHECK-NEXT: vmov.16 q0[6], r6
; CHECK-NEXT: vmov.16 q0[7], r6
; CHECK-NEXT: vstrb.16 q0, [r2]
; CHECK-NEXT: vstrb.16 q0, [r0]
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: ldr r2, [sp]
; CHECK-NEXT: str r2, [r1]
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: strh r0, [r1, #4]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i8> %s, <6 x i8> *%dst
  ret void
}

; Interleaves three <4 x i8> loads into a single <12 x i8> store.
define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) {
; CHECK-LABEL: vst3_v4i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: vldrb.u32 q1, [r0]
; CHECK-NEXT: vmov r2, lr, d0
; CHECK-NEXT: vmov r12, r3, d1
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
; CHECK-NEXT: vmov r0, r6, d3
; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: vmov.8 q2[8], r4
; CHECK-NEXT: vmov.8 q2[9], r6
; CHECK-NEXT: vmov.8 q2[10], r3
; CHECK-NEXT: vmov.8 q2[11], r5
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: str r3, [r1, #8]
; CHECK-NEXT: vmov r3, r4, d2
; CHECK-NEXT: vmov.16 q1[0], r3
; CHECK-NEXT: vmov r3, r5, d0
; CHECK-NEXT: vmov.16 q1[1], r2
; CHECK-NEXT: vmov.16 q1[2], r3
; CHECK-NEXT: vmov.16 q1[3], r4
; CHECK-NEXT: vmov.16 q1[4], lr
; CHECK-NEXT: vmov.16 q1[5], r5
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.16 q1[7], r12
; CHECK-NEXT: vstrb.16 q1, [r1]
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %s, <12 x i8> *%dst
  ret void
}

; Interleaves three <8 x i8> loads into a single <24 x i8> store.
define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
; CHECK-LABEL: vst3_v8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10}
; CHECK-NEXT: vpush {d8, d9, d10}
; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
; CHECK-NEXT: vmovx.f16 s12, s6
; CHECK-NEXT: vmovx.f16 s0, s10
; CHECK-NEXT: vins.f16 s0, s12
; CHECK-NEXT: vmovx.f16 s12, s7
; CHECK-NEXT: vins.f16 s1, s11
; CHECK-NEXT: vmovx.f16 s3, s11
; CHECK-NEXT: vins.f16 s3, s12
; CHECK-NEXT: vldrb.u16 q3, [r0]
; CHECK-NEXT: vmov.f32 s2, s7
; CHECK-NEXT: vmovx.f16 s20, s1
; CHECK-NEXT: vmov.f32 s17, s15
; CHECK-NEXT: vmov.u16 r0, q3[0]
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vins.f16 s17, s20
; CHECK-NEXT: vmovx.f16 s20, s18
; CHECK-NEXT: vins.f16 s2, s20
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s1, s17
; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vmov.8 q4[0], r0
; CHECK-NEXT: vmov.u16 r0, q2[0]
; CHECK-NEXT: vstrb.16 q0, [r1, #16]
; CHECK-NEXT: vmov.8 q4[1], r0
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.8 q4[2], r0
; CHECK-NEXT: vmov.u16 r0, q3[1]
; CHECK-NEXT: vmov.8 q4[3], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.8 q4[4], r0
; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmov.8 q4[5], r0
; CHECK-NEXT: vmov.u16 r0, q3[2]
; CHECK-NEXT: vmov.8 q4[6], r0
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.8 q4[7], r0
; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov.8 q4[8], r0
; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vmov.8 q4[9], r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.8 q4[10], r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.8 q4[11], r0
; CHECK-NEXT: vmov.u16 r0, q3[4]
; CHECK-NEXT: vmov.8 q4[12], r0
; CHECK-NEXT: vmov.u16 r0, q2[4]
; CHECK-NEXT: vmov.8 q4[13], r0
; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov.8 q4[14], r0
; CHECK-NEXT: vmov.u16 r0, q3[5]
; CHECK-NEXT: vmov.8 q4[15], r0
; CHECK-NEXT: vstrw.32 q4, [r1]
; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i8> %s, <24 x i8> *%dst
  ret void
}

define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) {
; CHECK-LABEL: vst3_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vmov.u8 r3, q3[0]
; CHECK-NEXT: vmov.u8 r0, q2[0]
; CHECK-NEXT: vmov.8 q5[0], r3
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov.8 q5[1], r0
; CHECK-NEXT: vmov.u8 r0, q3[1]
; CHECK-NEXT: vmov.8 q5[3], r0
; CHECK-NEXT: vmov.u8 r0, q2[1]
; CHECK-NEXT: vmov.8 q5[4], r0
; CHECK-NEXT: vmov.u8 r0, q3[2]
; CHECK-NEXT: vmov.8 q5[6], r0
; CHECK-NEXT: vmov.u8 r0, q2[2]
; CHECK-NEXT: vmov.8 q5[7], r0
; CHECK-NEXT: vmov.u8 r0, q3[3]
; CHECK-NEXT: vmov.8 q5[9], r0
; CHECK-NEXT: vmov.u8 r0, q2[3]
; CHECK-NEXT: vmov.8 q5[10], r0
; CHECK-NEXT: vmov.u8 r0, q3[4]
; CHECK-NEXT: vmov.8 q4[2], r2
; CHECK-NEXT: vmov.u8 r2, q1[2]
; CHECK-NEXT: vmov.8 q5[12], r0
; CHECK-NEXT: vmov.u8 r0, q2[4]
; CHECK-NEXT: vmov.8 q4[8], r2
; CHECK-NEXT: vmov.u8 r2, q1[3]
; CHECK-NEXT: vmov.8 q5[13], r0
; CHECK-NEXT: vmov.u8 r0, q3[5]
; CHECK-NEXT: vmov.8 q5[15], r0
; CHECK-NEXT: vmov.8 q4[11], r2
; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: vmov.u8 r0, q5[0]
; CHECK-NEXT: vmov.8 q4[14], r2
; CHECK-NEXT: vmov.8 q0[0], r0
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.u8 r0, q5[1]
; CHECK-NEXT: vmov.8 q0[1], r0
; CHECK-NEXT: vmov.u8 r2, q4[2]
; CHECK-NEXT: vmov.8 q0[2], r2
; CHECK-NEXT: vmov.u8 r0, q5[3]
; CHECK-NEXT: vmov.8 q0[3], r0
; CHECK-NEXT: 
vmov.u8 r0, q5[4] 787; CHECK-NEXT: vmov.8 q0[4], r0 788; CHECK-NEXT: vmov.u8 r0, q4[5] 789; CHECK-NEXT: vmov.8 q0[5], r0 790; CHECK-NEXT: vmov.u8 r0, q5[6] 791; CHECK-NEXT: vmov.8 q0[6], r0 792; CHECK-NEXT: vmov.u8 r0, q5[7] 793; CHECK-NEXT: vmov.8 q0[7], r0 794; CHECK-NEXT: vmov.u8 r0, q4[8] 795; CHECK-NEXT: vmov.8 q0[8], r0 796; CHECK-NEXT: vmov.u8 r0, q5[9] 797; CHECK-NEXT: vmov.8 q0[9], r0 798; CHECK-NEXT: vmov.u8 r0, q5[10] 799; CHECK-NEXT: vmov.8 q0[10], r0 800; CHECK-NEXT: vmov.u8 r0, q4[11] 801; CHECK-NEXT: vmov.8 q0[11], r0 802; CHECK-NEXT: vmov.u8 r0, q5[12] 803; CHECK-NEXT: vmov.8 q0[12], r0 804; CHECK-NEXT: vmov.u8 r0, q5[13] 805; CHECK-NEXT: vmov.8 q0[13], r0 806; CHECK-NEXT: vmov.u8 r0, q4[14] 807; CHECK-NEXT: vmov.8 q0[14], r0 808; CHECK-NEXT: vmov.u8 r0, q5[15] 809; CHECK-NEXT: vmov.8 q0[15], r0 810; CHECK-NEXT: vmov.u8 r0, q2[5] 811; CHECK-NEXT: vmov.8 q5[0], r0 812; CHECK-NEXT: vmov.u8 r0, q1[5] 813; CHECK-NEXT: vmov.8 q5[1], r0 814; CHECK-NEXT: vmov.u8 r0, q2[6] 815; CHECK-NEXT: vmov.8 q5[3], r0 816; CHECK-NEXT: vmov.u8 r0, q1[6] 817; CHECK-NEXT: vmov.8 q5[4], r0 818; CHECK-NEXT: vmov.u8 r0, q2[7] 819; CHECK-NEXT: vmov.8 q5[6], r0 820; CHECK-NEXT: vmov.u8 r0, q1[7] 821; CHECK-NEXT: vmov.8 q5[7], r0 822; CHECK-NEXT: vmov.u8 r0, q2[8] 823; CHECK-NEXT: vmov.8 q5[9], r0 824; CHECK-NEXT: vmov.u8 r0, q1[8] 825; CHECK-NEXT: vmov.8 q5[10], r0 826; CHECK-NEXT: vmov.u8 r0, q2[9] 827; CHECK-NEXT: vmov.8 q5[12], r0 828; CHECK-NEXT: vmov.u8 r0, q1[9] 829; CHECK-NEXT: vmov.8 q5[13], r0 830; CHECK-NEXT: vmov.u8 r0, q2[10] 831; CHECK-NEXT: vmov.8 q5[15], r0 832; CHECK-NEXT: vstrw.32 q0, [r1] 833; CHECK-NEXT: vmov.u8 r0, q5[0] 834; CHECK-NEXT: vmov.8 q4[0], r0 835; CHECK-NEXT: vmov.u8 r0, q5[1] 836; CHECK-NEXT: vmov.8 q4[1], r0 837; CHECK-NEXT: vmov.u8 r0, q3[7] 838; CHECK-NEXT: vmov.8 q6[5], r0 839; CHECK-NEXT: vmov.u8 r0, q3[8] 840; CHECK-NEXT: vmov.8 q6[8], r0 841; CHECK-NEXT: vmov.u8 r0, q3[9] 842; CHECK-NEXT: vmov.8 q6[11], r0 843; CHECK-NEXT: vmov.f32 s24, 
s13 844; CHECK-NEXT: vmov.f32 s27, s14 845; CHECK-NEXT: vmov.u8 r0, q6[2] 846; CHECK-NEXT: vmov.8 q4[2], r0 847; CHECK-NEXT: vmov.u8 r0, q5[3] 848; CHECK-NEXT: vmov.8 q4[3], r0 849; CHECK-NEXT: vmov.u8 r0, q5[4] 850; CHECK-NEXT: vmov.8 q4[4], r0 851; CHECK-NEXT: vmov.u8 r0, q6[5] 852; CHECK-NEXT: vmov.8 q4[5], r0 853; CHECK-NEXT: vmov.u8 r0, q5[6] 854; CHECK-NEXT: vmov.8 q4[6], r0 855; CHECK-NEXT: vmov.u8 r0, q5[7] 856; CHECK-NEXT: vmov.8 q4[7], r0 857; CHECK-NEXT: vmov.u8 r0, q6[8] 858; CHECK-NEXT: vmov.8 q4[8], r0 859; CHECK-NEXT: vmov.u8 r0, q5[9] 860; CHECK-NEXT: vmov.8 q4[9], r0 861; CHECK-NEXT: vmov.u8 r0, q5[10] 862; CHECK-NEXT: vmov.8 q4[10], r0 863; CHECK-NEXT: vmov.u8 r0, q6[11] 864; CHECK-NEXT: vmov.8 q4[11], r0 865; CHECK-NEXT: vmov.u8 r0, q5[12] 866; CHECK-NEXT: vmov.8 q4[12], r0 867; CHECK-NEXT: vmov.u8 r0, q5[13] 868; CHECK-NEXT: vmov.8 q4[13], r0 869; CHECK-NEXT: vmov.u8 r0, q6[14] 870; CHECK-NEXT: vmov.8 q4[14], r0 871; CHECK-NEXT: vmov.u8 r0, q5[15] 872; CHECK-NEXT: vmov.8 q4[15], r0 873; CHECK-NEXT: vmov.u8 r0, q1[10] 874; CHECK-NEXT: vmov.8 q5[0], r0 875; CHECK-NEXT: vmov.u8 r0, q3[11] 876; CHECK-NEXT: vmov.8 q5[1], r0 877; CHECK-NEXT: vmov.u8 r0, q1[11] 878; CHECK-NEXT: vmov.8 q5[3], r0 879; CHECK-NEXT: vmov.u8 r0, q3[12] 880; CHECK-NEXT: vmov.8 q5[4], r0 881; CHECK-NEXT: vmov.u8 r0, q1[12] 882; CHECK-NEXT: vmov.8 q5[6], r0 883; CHECK-NEXT: vmov.u8 r0, q3[13] 884; CHECK-NEXT: vmov.8 q5[7], r0 885; CHECK-NEXT: vmov.u8 r0, q1[13] 886; CHECK-NEXT: vmov.8 q5[9], r0 887; CHECK-NEXT: vmov.u8 r0, q3[14] 888; CHECK-NEXT: vmov.8 q5[10], r0 889; CHECK-NEXT: vmov.u8 r0, q1[14] 890; CHECK-NEXT: vmov.8 q5[12], r0 891; CHECK-NEXT: vmov.u8 r0, q3[15] 892; CHECK-NEXT: vmov.8 q5[13], r0 893; CHECK-NEXT: vmov.u8 r0, q1[15] 894; CHECK-NEXT: vmov.8 q5[15], r0 895; CHECK-NEXT: vstrw.32 q4, [r1, #16] 896; CHECK-NEXT: vmov.u8 r0, q5[0] 897; CHECK-NEXT: vmov.8 q1[0], r0 898; CHECK-NEXT: vmov.u8 r0, q5[1] 899; CHECK-NEXT: vmov.8 q1[1], r0 900; CHECK-NEXT: vmov.u8 r0, 
q2[11] 901; CHECK-NEXT: vmov.8 q3[2], r0 902; CHECK-NEXT: vmov.u8 r0, q2[12] 903; CHECK-NEXT: vmov.8 q3[5], r0 904; CHECK-NEXT: vmov.u8 r0, q2[13] 905; CHECK-NEXT: vmov.8 q3[8], r0 906; CHECK-NEXT: vmov.u8 r0, q2[14] 907; CHECK-NEXT: vmov.8 q3[11], r0 908; CHECK-NEXT: vmov.u8 r0, q2[15] 909; CHECK-NEXT: vmov.8 q3[14], r0 910; CHECK-NEXT: vmov.u8 r0, q3[2] 911; CHECK-NEXT: vmov.8 q1[2], r0 912; CHECK-NEXT: vmov.u8 r0, q5[3] 913; CHECK-NEXT: vmov.8 q1[3], r0 914; CHECK-NEXT: vmov.u8 r0, q5[4] 915; CHECK-NEXT: vmov.8 q1[4], r0 916; CHECK-NEXT: vmov.u8 r0, q3[5] 917; CHECK-NEXT: vmov.8 q1[5], r0 918; CHECK-NEXT: vmov.u8 r0, q5[6] 919; CHECK-NEXT: vmov.8 q1[6], r0 920; CHECK-NEXT: vmov.u8 r0, q5[7] 921; CHECK-NEXT: vmov.8 q1[7], r0 922; CHECK-NEXT: vmov.u8 r0, q3[8] 923; CHECK-NEXT: vmov.8 q1[8], r0 924; CHECK-NEXT: vmov.u8 r0, q5[9] 925; CHECK-NEXT: vmov.8 q1[9], r0 926; CHECK-NEXT: vmov.u8 r0, q5[10] 927; CHECK-NEXT: vmov.8 q1[10], r0 928; CHECK-NEXT: vmov.u8 r0, q3[11] 929; CHECK-NEXT: vmov.8 q1[11], r0 930; CHECK-NEXT: vmov.u8 r0, q5[12] 931; CHECK-NEXT: vmov.8 q1[12], r0 932; CHECK-NEXT: vmov.u8 r0, q5[13] 933; CHECK-NEXT: vmov.8 q1[13], r0 934; CHECK-NEXT: vmov.u8 r0, q3[14] 935; CHECK-NEXT: vmov.8 q1[14], r0 936; CHECK-NEXT: vmov.u8 r0, q5[15] 937; CHECK-NEXT: vmov.8 q1[15], r0 938; CHECK-NEXT: vstrw.32 q1, [r1, #32] 939; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 940; CHECK-NEXT: bx lr 941entry: 942 %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 943 %l1 = load <16 x i8>, <16 x i8>* %s1, align 4 944 %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1 945 %l2 = load <16 x i8>, <16 x i8>* %s2, align 4 946 %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2 947 %l3 = load <16 x i8>, <16 x i8>* %s3, align 4 948 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, 
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 949 %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 950 %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 951 store <48 x i8> %s, <48 x i8> *%dst 952 ret void 953} 954 955; i64 956 957define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { 958; CHECK-LABEL: vst3_v2i64: 959; CHECK: @ %bb.0: @ %entry 960; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 961; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 962; CHECK-NEXT: vldrw.u32 q1, [r0] 963; CHECK-NEXT: vmov.f64 d6, d5 964; CHECK-NEXT: vmov.f32 s13, s11 965; CHECK-NEXT: vmov.f32 s14, s2 966; CHECK-NEXT: vmov.f32 s15, s3 967; CHECK-NEXT: vmov.f32 s2, s6 968; CHECK-NEXT: vmov.f32 s3, s7 969; CHECK-NEXT: vmov.f32 s6, s8 970; CHECK-NEXT: vmov.f32 s7, s9 971; CHECK-NEXT: vstrb.8 q1, [r1], #32 972; CHECK-NEXT: vstrw.32 q3, [r1] 973; CHECK-NEXT: vstrw.32 q0, [r1, #-16] 974; CHECK-NEXT: bx lr 975entry: 976 %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 977 %l1 = load <2 x i64>, <2 x i64>* %s1, align 4 978 %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1 979 %l2 = load <2 x i64>, <2 x i64>* %s2, align 4 980 %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2 981 %l3 = load <2 x i64>, <2 x i64>* %s3, align 4 982 %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, 
i32 2, i32 3> 983 %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 984 %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 985 store <6 x i64> %s, <6 x i64> *%dst 986 ret void 987} 988 989define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) { 990; CHECK-LABEL: vst3_v4i64: 991; CHECK: @ %bb.0: @ %entry 992; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 993; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 994; CHECK-NEXT: .pad #16 995; CHECK-NEXT: sub sp, #16 996; CHECK-NEXT: vldrw.u32 q1, [r0] 997; CHECK-NEXT: vldrw.u32 q7, [r0, #32] 998; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 999; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1000; CHECK-NEXT: vmov.f64 d10, d2 1001; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill 1002; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 1003; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 1004; CHECK-NEXT: vmov.f32 s21, s5 1005; CHECK-NEXT: vmov.f32 s22, s28 1006; CHECK-NEXT: vmov.f32 s23, s29 1007; CHECK-NEXT: vmov.f64 d14, d12 1008; CHECK-NEXT: vstrw.32 q5, [r1] 1009; CHECK-NEXT: vmov.f32 s29, s25 1010; CHECK-NEXT: vmov.f64 d8, d7 1011; CHECK-NEXT: vmov.f32 s30, s12 1012; CHECK-NEXT: vmov.f32 s17, s15 1013; CHECK-NEXT: vmov.f32 s31, s13 1014; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload 1015; CHECK-NEXT: vmov.f32 s18, s2 1016; CHECK-NEXT: vstrw.32 q7, [r1, #48] 1017; CHECK-NEXT: vmov.f32 s4, s8 1018; CHECK-NEXT: vmov.f32 s19, s3 1019; CHECK-NEXT: vmov.f32 s2, s26 1020; CHECK-NEXT: vstrw.32 q4, [r1, #80] 1021; CHECK-NEXT: vmov.f32 s5, s9 1022; CHECK-NEXT: vmov.f32 s8, s14 1023; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1024; CHECK-NEXT: vmov.f32 s3, s27 1025; CHECK-NEXT: vmov.f32 s9, s15 1026; CHECK-NEXT: vstrw.32 q0, [r1, #64] 1027; CHECK-NEXT: vstrw.32 q2, [r1, #32] 1028; CHECK-NEXT: add sp, #16 1029; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1030; CHECK-NEXT: bx lr 1031entry: 1032 %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0 
1033 %l1 = load <4 x i64>, <4 x i64>* %s1, align 4 1034 %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1 1035 %l2 = load <4 x i64>, <4 x i64>* %s2, align 4 1036 %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2 1037 %l3 = load <4 x i64>, <4 x i64>* %s3, align 4 1038 %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1039 %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1040 %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 1041 store <12 x i64> %s, <12 x i64> *%dst 1042 ret void 1043} 1044 1045; f32 1046 1047define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) { 1048; CHECK-LABEL: vst3_v2f32: 1049; CHECK: @ %bb.0: @ %entry 1050; CHECK-NEXT: vldr s0, [r0] 1051; CHECK-NEXT: vldr s3, [r0, #4] 1052; CHECK-NEXT: vldr s1, [r0, #8] 1053; CHECK-NEXT: ldr r2, [r0, #20] 1054; CHECK-NEXT: vldr s2, [r0, #16] 1055; CHECK-NEXT: ldr r0, [r0, #12] 1056; CHECK-NEXT: strd r0, r2, [r1, #16] 1057; CHECK-NEXT: vstrw.32 q0, [r1] 1058; CHECK-NEXT: bx lr 1059entry: 1060 %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0 1061 %l1 = load <2 x float>, <2 x float>* %s1, align 4 1062 %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1 1063 %l2 = load <2 x float>, <2 x float>* %s2, align 4 1064 %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2 1065 %l3 = load <2 x float>, <2 x float>* %s3, align 4 1066 %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1067 %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1068 %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 1069 store <6 x float> %s, <6 x float> *%dst 1070 ret void 1071} 1072 1073define void @vst3_v4f32(<4 x 
float> *%src, <12 x float> *%dst) { 1074; CHECK-LABEL: vst3_v4f32: 1075; CHECK: @ %bb.0: @ %entry 1076; CHECK-NEXT: .vsave {d8, d9} 1077; CHECK-NEXT: vpush {d8, d9} 1078; CHECK-NEXT: vldrw.u32 q3, [r0] 1079; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 1080; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 1081; CHECK-NEXT: vmov.f64 d8, d6 1082; CHECK-NEXT: vmov.f32 s17, s0 1083; CHECK-NEXT: vmov.f32 s8, s1 1084; CHECK-NEXT: vmov.f32 s19, s13 1085; CHECK-NEXT: vmov.f32 s9, s5 1086; CHECK-NEXT: vmov.f32 s18, s4 1087; CHECK-NEXT: vmov.f32 s4, s6 1088; CHECK-NEXT: vstrw.32 q4, [r1] 1089; CHECK-NEXT: vmov.f32 s11, s2 1090; CHECK-NEXT: vmov.f32 s5, s15 1091; CHECK-NEXT: vmov.f32 s10, s14 1092; CHECK-NEXT: vmov.f32 s6, s3 1093; CHECK-NEXT: vstrw.32 q2, [r1, #16] 1094; CHECK-NEXT: vstrw.32 q1, [r1, #32] 1095; CHECK-NEXT: vpop {d8, d9} 1096; CHECK-NEXT: bx lr 1097entry: 1098 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 1099 %l1 = load <4 x float>, <4 x float>* %s1, align 4 1100 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1 1101 %l2 = load <4 x float>, <4 x float>* %s2, align 4 1102 %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2 1103 %l3 = load <4 x float>, <4 x float>* %s3, align 4 1104 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1105 %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1106 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 1107 store <12 x float> %s, <12 x float> *%dst 1108 ret void 1109} 1110 1111define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) { 1112; CHECK-LABEL: vst3_v8f32: 1113; CHECK: @ %bb.0: @ %entry 1114; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1115; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1116; CHECK-NEXT: .pad #16 1117; 
CHECK-NEXT: sub sp, #16 1118; CHECK-NEXT: vldrw.u32 q4, [r0] 1119; CHECK-NEXT: vldrw.u32 q6, [r0, #32] 1120; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 1121; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 1122; CHECK-NEXT: vmov.f64 d10, d8 1123; CHECK-NEXT: vldrw.u32 q7, [r0, #48] 1124; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill 1125; CHECK-NEXT: vldrw.u32 q1, [r0, #64] 1126; CHECK-NEXT: vmov.f32 s21, s24 1127; CHECK-NEXT: vmov.f64 d12, d4 1128; CHECK-NEXT: vmov.f64 d6, d1 1129; CHECK-NEXT: vmov.f32 s25, s28 1130; CHECK-NEXT: vmov.f32 s13, s11 1131; CHECK-NEXT: vmov.f32 s27, s9 1132; CHECK-NEXT: vmov.f32 s15, s3 1133; CHECK-NEXT: vmov.f32 s26, s0 1134; CHECK-NEXT: vmov.f32 s0, s29 1135; CHECK-NEXT: vstrw.32 q6, [r1, #48] 1136; CHECK-NEXT: vmov.f32 s3, s30 1137; CHECK-NEXT: vmov.f32 s14, s31 1138; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload 1139; CHECK-NEXT: vmov.f32 s23, s17 1140; CHECK-NEXT: vstrw.32 q3, [r1, #80] 1141; CHECK-NEXT: vmov.f32 s2, s10 1142; CHECK-NEXT: vmov.f32 s8, s29 1143; CHECK-NEXT: vstrw.32 q0, [r1, #64] 1144; CHECK-NEXT: vmov.f32 s9, s5 1145; CHECK-NEXT: vmov.f32 s22, s4 1146; CHECK-NEXT: vmov.f32 s4, s6 1147; CHECK-NEXT: vstrw.32 q5, [r1] 1148; CHECK-NEXT: vmov.f32 s11, s30 1149; CHECK-NEXT: vmov.f32 s5, s19 1150; CHECK-NEXT: vmov.f32 s10, s18 1151; CHECK-NEXT: vmov.f32 s6, s31 1152; CHECK-NEXT: vstrw.32 q2, [r1, #16] 1153; CHECK-NEXT: vstrw.32 q1, [r1, #32] 1154; CHECK-NEXT: add sp, #16 1155; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1156; CHECK-NEXT: bx lr 1157entry: 1158 %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0 1159 %l1 = load <8 x float>, <8 x float>* %s1, align 4 1160 %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1 1161 %l2 = load <8 x float>, <8 x float>* %s2, align 4 1162 %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2 1163 %l3 = load <8 x float>, <8 x float>* %s3, align 4 1164 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, 
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1165 %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1166 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 1167 store <24 x float> %s, <24 x float> *%dst 1168 ret void 1169} 1170 1171define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) { 1172; CHECK-LABEL: vst3_v16f32: 1173; CHECK: @ %bb.0: @ %entry 1174; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1175; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1176; CHECK-NEXT: .pad #160 1177; CHECK-NEXT: sub sp, #160 1178; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1179; CHECK-NEXT: vldrw.u32 q5, [r0, #112] 1180; CHECK-NEXT: vldrw.u32 q2, [r0, #128] 1181; CHECK-NEXT: vldrw.u32 q6, [r0] 1182; CHECK-NEXT: vmov.f32 s12, s1 1183; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill 1184; CHECK-NEXT: vldrw.u32 q5, [r0, #96] 1185; CHECK-NEXT: vmov.f32 s13, s9 1186; CHECK-NEXT: vmov.f32 s15, s2 1187; CHECK-NEXT: vldrw.u32 q4, [r0, #160] 1188; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill 1189; CHECK-NEXT: vldrw.u32 q5, [r0, #80] 1190; CHECK-NEXT: vldrw.u32 q7, [r0, #32] 1191; CHECK-NEXT: vmov.f32 s14, s26 1192; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill 1193; CHECK-NEXT: vldrw.u32 q4, [r0, #144] 1194; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill 1195; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill 1196; CHECK-NEXT: vldrw.u32 q7, [r0, #16] 1197; CHECK-NEXT: vldrw.u32 q1, [r0, #176] 1198; CHECK-NEXT: vldrw.u32 q5, [r0, #48] 1199; CHECK-NEXT: vstrw.32 q3, [r1, #16] 1200; CHECK-NEXT: vmov.f64 d6, d5 1201; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill 1202; 
CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload 1203; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill 1204; CHECK-NEXT: vmov.f32 s13, s27 1205; CHECK-NEXT: vmov.f32 s15, s11 1206; CHECK-NEXT: vmov.f32 s14, s3 1207; CHECK-NEXT: vstrw.32 q3, [r1, #32] 1208; CHECK-NEXT: vmov.f64 d6, d3 1209; CHECK-NEXT: vmov.f32 s13, s23 1210; CHECK-NEXT: vmov.f32 s15, s7 1211; CHECK-NEXT: vmov.f32 s14, s31 1212; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill 1213; CHECK-NEXT: vmov.f64 d6, d12 1214; CHECK-NEXT: vmov.f32 s13, s0 1215; CHECK-NEXT: vmov.f32 s15, s25 1216; CHECK-NEXT: vmov.f32 s14, s8 1217; CHECK-NEXT: vmov q2, q7 1218; CHECK-NEXT: vmov.f64 d0, d10 1219; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill 1220; CHECK-NEXT: vmov.f32 s1, s8 1221; CHECK-NEXT: vmov.f32 s3, s21 1222; CHECK-NEXT: vmov.f32 s2, s4 1223; CHECK-NEXT: vmov.f32 s4, s9 1224; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill 1225; CHECK-NEXT: vmov.f32 s7, s10 1226; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 1227; CHECK-NEXT: vmov.f32 s6, s22 1228; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill 1229; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload 1230; CHECK-NEXT: vmov.f64 d4, d1 1231; CHECK-NEXT: vmov q3, q1 1232; CHECK-NEXT: vmov.f32 s20, s5 1233; CHECK-NEXT: vmov.f32 s21, s1 1234; CHECK-NEXT: vmov.f32 s23, s6 1235; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload 1236; CHECK-NEXT: vmov.f64 d12, d9 1237; CHECK-NEXT: vmov q7, q1 1238; CHECK-NEXT: vmov.f32 s9, s7 1239; CHECK-NEXT: vmov.f32 s22, s6 1240; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload 1241; CHECK-NEXT: vmov.f32 s11, s3 1242; CHECK-NEXT: vmov q0, q7 1243; CHECK-NEXT: vmov.f32 s25, s7 1244; CHECK-NEXT: vstrw.32 q5, [r1, #112] 1245; CHECK-NEXT: vmov.f32 s27, s19 1246; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload 1247; CHECK-NEXT: vmov.f32 s10, s15 1248; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 1249; CHECK-NEXT: vmov.f32 s29, s16 1250; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 
1251; CHECK-NEXT: vmov.f32 s31, s1 1252; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 1253; CHECK-NEXT: vmov.f32 s26, s15 1254; CHECK-NEXT: vstrw.32 q2, [r1, #128] 1255; CHECK-NEXT: vmov.f32 s30, s0 1256; CHECK-NEXT: vstrw.32 q6, [r1, #80] 1257; CHECK-NEXT: vmov.f64 d0, d2 1258; CHECK-NEXT: vstrw.32 q7, [r1, #96] 1259; CHECK-NEXT: vmov.f32 s1, s12 1260; CHECK-NEXT: vmov.f32 s3, s5 1261; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload 1262; CHECK-NEXT: vmov.f32 s2, s16 1263; CHECK-NEXT: vstrw.32 q0, [r1, #48] 1264; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload 1265; CHECK-NEXT: vmov.f32 s16, s13 1266; CHECK-NEXT: vstrw.32 q0, [r1, #144] 1267; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 1268; CHECK-NEXT: vmov.f32 s19, s14 1269; CHECK-NEXT: vstrw.32 q0, [r1, #160] 1270; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload 1271; CHECK-NEXT: vmov.f32 s18, s6 1272; CHECK-NEXT: vstrw.32 q0, [r1, #176] 1273; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload 1274; CHECK-NEXT: vstrw.32 q4, [r1, #64] 1275; CHECK-NEXT: vstrw.32 q0, [r1] 1276; CHECK-NEXT: add sp, #160 1277; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1278; CHECK-NEXT: bx lr 1279entry: 1280 %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0 1281 %l1 = load <16 x float>, <16 x float>* %s1, align 4 1282 %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1 1283 %l2 = load <16 x float>, <16 x float>* %s2, align 4 1284 %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2 1285 %l3 = load <16 x float>, <16 x float>* %s3, align 4 1286 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1287 %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, 
i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1288 %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 1289 store <48 x float> %s, <48 x float> *%dst 1290 ret void 1291} 1292 1293; f16 1294 1295define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) { 1296; CHECK-LABEL: vst3_v2f16: 1297; CHECK: @ %bb.0: @ %entry 1298; CHECK-NEXT: vldmia r0, {s0, s1} 1299; CHECK-NEXT: ldr r0, [r0, #8] 1300; CHECK-NEXT: vmovx.f16 s8, s0 1301; CHECK-NEXT: vins.f16 s0, s1 1302; CHECK-NEXT: vmov.32 q1[0], r0 1303; CHECK-NEXT: vmovx.f16 s2, s1 1304; CHECK-NEXT: vmovx.f16 s10, s4 1305; CHECK-NEXT: vins.f16 s4, s8 1306; CHECK-NEXT: vins.f16 s2, s10 1307; CHECK-NEXT: vmov.f32 s1, s4 1308; CHECK-NEXT: vmov r3, s2 1309; CHECK-NEXT: vmov r0, r2, d0 1310; CHECK-NEXT: stm r1!, {r0, r2, r3} 1311; CHECK-NEXT: bx lr 1312entry: 1313 %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 1314 %l1 = load <2 x half>, <2 x half>* %s1, align 4 1315 %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1 1316 %l2 = load <2 x half>, <2 x half>* %s2, align 4 1317 %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2 1318 %l3 = load <2 x half>, <2 x half>* %s3, align 4 1319 %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1320 %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1321 %s = shufflevector <4 x half> %t1, <4 x 
half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 1322 store <6 x half> %s, <6 x half> *%dst 1323 ret void 1324} 1325 1326define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { 1327; CHECK-LABEL: vst3_v4f16: 1328; CHECK: @ %bb.0: @ %entry 1329; CHECK-NEXT: .save {r7, lr} 1330; CHECK-NEXT: push {r7, lr} 1331; CHECK-NEXT: .vsave {d8, d9} 1332; CHECK-NEXT: vpush {d8, d9} 1333; CHECK-NEXT: ldrd r2, r12, [r0] 1334; CHECK-NEXT: ldrd r3, lr, [r0, #8] 1335; CHECK-NEXT: vmov.32 q0[0], r2 1336; CHECK-NEXT: ldrd r2, r0, [r0, #16] 1337; CHECK-NEXT: vmov.32 q1[0], r3 1338; CHECK-NEXT: vmov.32 q0[1], r12 1339; CHECK-NEXT: vmov.32 q1[1], lr 1340; CHECK-NEXT: vmov.f32 s2, s4 1341; CHECK-NEXT: vmov.f32 s3, s5 1342; CHECK-NEXT: vmovx.f16 s10, s0 1343; CHECK-NEXT: vmov.f32 s8, s1 1344; CHECK-NEXT: vins.f16 s0, s2 1345; CHECK-NEXT: vins.f16 s8, s5 1346; CHECK-NEXT: vmov.32 q1[0], r2 1347; CHECK-NEXT: vmov.32 q1[1], r0 1348; CHECK-NEXT: vmovx.f16 s2, s2 1349; CHECK-NEXT: vmovx.f16 s12, s4 1350; CHECK-NEXT: vins.f16 s4, s10 1351; CHECK-NEXT: vins.f16 s2, s12 1352; CHECK-NEXT: vmovx.f16 s10, s1 1353; CHECK-NEXT: vmovx.f16 s12, s5 1354; CHECK-NEXT: vmovx.f16 s17, s3 1355; CHECK-NEXT: vins.f16 s5, s10 1356; CHECK-NEXT: vins.f16 s17, s12 1357; CHECK-NEXT: vmov.f32 s16, s5 1358; CHECK-NEXT: vmov.f32 s1, s4 1359; CHECK-NEXT: vmov.f32 s3, s8 1360; CHECK-NEXT: vstrw.32 q0, [r1] 1361; CHECK-NEXT: vmov r0, r2, d8 1362; CHECK-NEXT: strd r0, r2, [r1, #16] 1363; CHECK-NEXT: vpop {d8, d9} 1364; CHECK-NEXT: pop {r7, pc} 1365entry: 1366 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 1367 %l1 = load <4 x half>, <4 x half>* %s1, align 4 1368 %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1 1369 %l2 = load <4 x half>, <4 x half>* %s2, align 4 1370 %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2 1371 %l3 = load <4 x half>, <4 x half>* %s3, align 4 1372 %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7> 1373 %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1374 %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 1375 store <12 x half> %s, <12 x half> *%dst 1376 ret void 1377} 1378 1379define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) { 1380; CHECK-LABEL: vst3_v8f16: 1381; CHECK: @ %bb.0: @ %entry 1382; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} 1383; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} 1384; CHECK-NEXT: vldrw.u32 q2, [r0] 1385; CHECK-NEXT: vldrw.u32 q5, [r0, #16] 1386; CHECK-NEXT: vmov.f64 d0, d4 1387; CHECK-NEXT: vmovx.f16 s6, s20 1388; CHECK-NEXT: vmovx.f16 s12, s8 1389; CHECK-NEXT: vmov.f32 s4, s9 1390; CHECK-NEXT: vins.f16 s0, s20 1391; CHECK-NEXT: vmov r2, s6 1392; CHECK-NEXT: vins.f16 s4, s21 1393; CHECK-NEXT: vmov.16 q0[4], r2 1394; CHECK-NEXT: vmov.f32 s3, s4 1395; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 1396; CHECK-NEXT: vmov.f32 s1, s8 1397; CHECK-NEXT: vmov.f32 s17, s4 1398; CHECK-NEXT: vmovx.f16 s24, s7 1399; CHECK-NEXT: vmov.f32 s18, s4 1400; CHECK-NEXT: vins.f16 s17, s12 1401; CHECK-NEXT: vmovx.f16 s12, s18 1402; CHECK-NEXT: vins.f16 s2, s12 1403; CHECK-NEXT: vmovx.f16 s15, s23 1404; CHECK-NEXT: vins.f16 s15, s24 1405; CHECK-NEXT: vmovx.f16 s24, s6 1406; CHECK-NEXT: vmovx.f16 s12, s22 1407; CHECK-NEXT: vmov.f32 s18, s2 1408; CHECK-NEXT: vins.f16 s12, s24 1409; CHECK-NEXT: vmov.f32 s25, s11 1410; CHECK-NEXT: vins.f16 s13, s23 1411; CHECK-NEXT: vmov.f32 s26, s11 1412; CHECK-NEXT: vmov.f32 s14, s7 1413; CHECK-NEXT: vmovx.f16 s28, s13 1414; CHECK-NEXT: vins.f16 s25, s28 1415; CHECK-NEXT: vmovx.f16 s28, s26 1416; CHECK-NEXT: vins.f16 s14, s28 1417; CHECK-NEXT: vmovx.f16 s28, s9 1418; CHECK-NEXT: vmov.f32 s4, s5 1419; CHECK-NEXT: vrev32.16 q5, q5 1420; CHECK-NEXT: vins.f16 s4, s28 1421; CHECK-NEXT: vmovx.f16 s28, s10 1422; 
CHECK-NEXT: vins.f16 s6, s28 1423; CHECK-NEXT: vmov.f32 s26, s14 1424; CHECK-NEXT: vmov.f32 s7, s6 1425; CHECK-NEXT: vmov.f32 s6, s10 1426; CHECK-NEXT: vmovx.f16 s8, s5 1427; CHECK-NEXT: vins.f16 s21, s8 1428; CHECK-NEXT: vmovx.f16 s8, s22 1429; CHECK-NEXT: vins.f16 s6, s8 1430; CHECK-NEXT: vmov.f32 s1, s17 1431; CHECK-NEXT: vmov.f32 s22, s6 1432; CHECK-NEXT: vmov.f32 s13, s25 1433; CHECK-NEXT: vmov.f32 s5, s21 1434; CHECK-NEXT: vmov.f32 s2, s18 1435; CHECK-NEXT: vmov.f32 s14, s26 1436; CHECK-NEXT: vstrw.32 q0, [r1] 1437; CHECK-NEXT: vstrw.32 q3, [r1, #32] 1438; CHECK-NEXT: vmov.f32 s6, s22 1439; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1440; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} 1441; CHECK-NEXT: bx lr 1442entry: 1443 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 1444 %l1 = load <8 x half>, <8 x half>* %s1, align 4 1445 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1 1446 %l2 = load <8 x half>, <8 x half>* %s2, align 4 1447 %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2 1448 %l3 = load <8 x half>, <8 x half>* %s3, align 4 1449 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1450 %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1451 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 1452 store <24 x half> %s, <24 x half> *%dst 1453 ret void 1454} 1455 1456define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) { 1457; CHECK-LABEL: vst3_v16f16: 1458; CHECK: @ %bb.0: @ %entry 1459; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1460; 
CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1461; CHECK-NEXT: .pad #128 1462; CHECK-NEXT: sub sp, #128 1463; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1464; CHECK-NEXT: vldrw.u32 q4, [r0, #64] 1465; CHECK-NEXT: vldrw.u32 q6, [r0] 1466; CHECK-NEXT: vldrw.u32 q5, [r0, #16] 1467; CHECK-NEXT: vmovx.f16 s0, s19 1468; CHECK-NEXT: vmovx.f16 s7, s15 1469; CHECK-NEXT: vins.f16 s7, s0 1470; CHECK-NEXT: vmovx.f16 s0, s18 1471; CHECK-NEXT: vmovx.f16 s4, s14 1472; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill 1473; CHECK-NEXT: vins.f16 s4, s0 1474; CHECK-NEXT: vmov.f64 d14, d12 1475; CHECK-NEXT: vins.f16 s5, s15 1476; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill 1477; CHECK-NEXT: vmov.f32 s6, s19 1478; CHECK-NEXT: vmovx.f16 s0, s5 1479; CHECK-NEXT: vmov q2, q1 1480; CHECK-NEXT: vmov.f32 s5, s27 1481; CHECK-NEXT: vmov.f32 s6, s27 1482; CHECK-NEXT: vins.f16 s28, s12 1483; CHECK-NEXT: vins.f16 s5, s0 1484; CHECK-NEXT: vmovx.f16 s0, s6 1485; CHECK-NEXT: vins.f16 s10, s0 1486; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill 1487; CHECK-NEXT: vmov.f64 d2, d10 1488; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill 1489; CHECK-NEXT: vldrw.u32 q2, [r0, #48] 1490; CHECK-NEXT: vmovx.f16 s2, s8 1491; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill 1492; CHECK-NEXT: vmov.f32 s0, s21 1493; CHECK-NEXT: vins.f16 s4, s8 1494; CHECK-NEXT: vmov r2, s2 1495; CHECK-NEXT: vins.f16 s0, s9 1496; CHECK-NEXT: vmov.16 q1[4], r2 1497; CHECK-NEXT: vmovx.f16 s2, s12 1498; CHECK-NEXT: vmov.f32 s7, s0 1499; CHECK-NEXT: vmovx.f16 s0, s20 1500; CHECK-NEXT: vmov.f32 s5, s20 1501; CHECK-NEXT: vldrw.u32 q5, [r0, #80] 1502; CHECK-NEXT: vmov r0, s2 1503; CHECK-NEXT: vmov.f32 s9, s20 1504; CHECK-NEXT: vmov.16 q7[4], r0 1505; CHECK-NEXT: vmov.f32 s10, s20 1506; CHECK-NEXT: vins.f16 s9, s0 1507; CHECK-NEXT: vmovx.f16 s0, s10 1508; CHECK-NEXT: vins.f16 s6, s0 1509; CHECK-NEXT: vmov.f32 s0, s25 1510; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill 1511; CHECK-NEXT: vmov q2, q4 1512; CHECK-NEXT: 
vins.f16 s0, s13 1513; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill 1514; CHECK-NEXT: vmov.f32 s5, s8 1515; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload 1516; CHECK-NEXT: vmov.f32 s31, s0 1517; CHECK-NEXT: vmovx.f16 s0, s24 1518; CHECK-NEXT: vmov.f32 s6, s8 1519; CHECK-NEXT: vins.f16 s5, s0 1520; CHECK-NEXT: vmov.f32 s29, s24 1521; CHECK-NEXT: vmovx.f16 s0, s6 1522; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill 1523; CHECK-NEXT: vins.f16 s30, s0 1524; CHECK-NEXT: vmovx.f16 s0, s22 1525; CHECK-NEXT: vmovx.f16 s4, s14 1526; CHECK-NEXT: vmov.f32 s8, s9 1527; CHECK-NEXT: vins.f16 s4, s0 1528; CHECK-NEXT: vmovx.f16 s0, s23 1529; CHECK-NEXT: vmovx.f16 s7, s15 1530; CHECK-NEXT: vins.f16 s7, s0 1531; CHECK-NEXT: vins.f16 s5, s15 1532; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 1533; CHECK-NEXT: vmov.f32 s6, s23 1534; CHECK-NEXT: vmovx.f16 s16, s5 1535; CHECK-NEXT: vmov.f32 s1, s15 1536; CHECK-NEXT: vmov.f32 s2, s15 1537; CHECK-NEXT: vins.f16 s1, s16 1538; CHECK-NEXT: vmovx.f16 s16, s2 1539; CHECK-NEXT: vins.f16 s6, s16 1540; CHECK-NEXT: vmovx.f16 s16, s13 1541; CHECK-NEXT: vmov.f32 s20, s21 1542; CHECK-NEXT: vins.f16 s20, s16 1543; CHECK-NEXT: vmovx.f16 s16, s14 1544; CHECK-NEXT: vins.f16 s22, s16 1545; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload 1546; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload 1547; CHECK-NEXT: vmov.f32 s23, s22 1548; CHECK-NEXT: vmov.f32 s14, s18 1549; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill 1550; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload 1551; CHECK-NEXT: vmov.f32 s14, s30 1552; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill 1553; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 1554; CHECK-NEXT: vmov.f32 s2, s6 1555; CHECK-NEXT: vmov.f32 s22, s14 1556; CHECK-NEXT: vmovx.f16 s12, s21 1557; CHECK-NEXT: vstr s12, [sp, #64] @ 4-byte Spill 1558; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload 1559; CHECK-NEXT: vmov.f32 s5, s1 1560; CHECK-NEXT: vrev32.16 q4, q3 1561; 
CHECK-NEXT: vldr s12, [sp, #64] @ 4-byte Reload 1562; CHECK-NEXT: vins.f16 s17, s12 1563; CHECK-NEXT: vmovx.f16 s12, s18 1564; CHECK-NEXT: vins.f16 s22, s12 1565; CHECK-NEXT: vmovx.f16 s12, s25 1566; CHECK-NEXT: vmov.f32 s6, s2 1567; CHECK-NEXT: vins.f16 s8, s12 1568; CHECK-NEXT: vmovx.f16 s0, s26 1569; CHECK-NEXT: vmov.f32 s18, s22 1570; CHECK-NEXT: vins.f16 s10, s0 1571; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload 1572; CHECK-NEXT: vmov.f32 s11, s10 1573; CHECK-NEXT: vstrw.32 q1, [r1, #80] 1574; CHECK-NEXT: vmov.f32 s10, s26 1575; CHECK-NEXT: vrev32.16 q6, q0 1576; CHECK-NEXT: vmovx.f16 s12, s9 1577; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload 1578; CHECK-NEXT: vins.f16 s25, s12 1579; CHECK-NEXT: vmovx.f16 s12, s26 1580; CHECK-NEXT: vins.f16 s10, s12 1581; CHECK-NEXT: vmov.f32 s29, s1 1582; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload 1583; CHECK-NEXT: vmov.f32 s30, s2 1584; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 1585; CHECK-NEXT: vmov.f32 s26, s10 1586; CHECK-NEXT: vmov.f32 s1, s13 1587; CHECK-NEXT: vstrw.32 q7, [r1] 1588; CHECK-NEXT: vmov.f32 s2, s14 1589; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload 1590; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill 1591; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 1592; CHECK-NEXT: vmov.f32 s2, s14 1593; CHECK-NEXT: vmov.f32 s13, s1 1594; CHECK-NEXT: vmov.f32 s21, s17 1595; CHECK-NEXT: vmov.f32 s9, s25 1596; CHECK-NEXT: vmov.f32 s22, s18 1597; CHECK-NEXT: vmov.f32 s10, s26 1598; CHECK-NEXT: vstrw.32 q5, [r1, #64] 1599; CHECK-NEXT: vstrw.32 q2, [r1, #16] 1600; CHECK-NEXT: vmov.f32 s14, s2 1601; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 1602; CHECK-NEXT: vstrw.32 q3, [r1, #32] 1603; CHECK-NEXT: vstrw.32 q0, [r1, #48] 1604; CHECK-NEXT: add sp, #128 1605; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1606; CHECK-NEXT: bx lr 1607entry: 1608 %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0 1609 %l1 = load <16 x half>, <16 x half>* %s1, align 
4 1610 %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1 1611 %l2 = load <16 x half>, <16 x half>* %s2, align 4 1612 %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2 1613 %l3 = load <16 x half>, <16 x half>* %s3, align 4 1614 %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1615 %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1616 %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 1617 store <48 x half> %s, <48 x half> *%dst 1618 ret void 1619} 1620 1621; f64 1622 1623define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) { 1624; CHECK-LABEL: vst3_v2f64: 1625; CHECK: @ %bb.0: @ %entry 1626; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 1627; CHECK-NEXT: vldrw.u32 q0, [r0] 1628; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 1629; CHECK-NEXT: vmov.f64 d6, d2 1630; CHECK-NEXT: vmov.f64 d7, d1 1631; CHECK-NEXT: vmov.f64 d1, d4 1632; CHECK-NEXT: vstrw.32 q3, [r1, #16] 1633; CHECK-NEXT: vmov.f64 d2, d5 1634; CHECK-NEXT: vstrw.32 q0, [r1] 1635; CHECK-NEXT: vstrw.32 q1, [r1, #32] 1636; CHECK-NEXT: bx lr 1637entry: 1638 
; Body of vst3_v2f64: read three consecutive <2 x double> vectors from %src
; (vector indices 0, 1 and 2). Each load is only marked align 4.
  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
  %l1 = load <2 x double>, <2 x double>* %s1, align 4
  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
  %l2 = load <2 x double>, <2 x double>* %s2, align 4
  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
  %l3 = load <2 x double>, <2 x double>* %s3, align 4
  ; %t1 is the concatenation %l1 ++ %l2. %t2 widens %l3 to four lanes with
  ; undef padding so that both operands of the final shuffle share a type.
  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; 3-way interleave: result lanes are l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  ; The store carries no explicit alignment.
  store <6 x double> %s, <6 x double> *%dst
  ret void
}

; vst3_v4f64: 3-way interleaved store of three <4 x double> vectors read from
; %src, written to %dst as a single <12 x double>.
; The expected output below saves d8-d15, reserves 16 bytes of stack, spills q6
; to [sp] and later reloads that slot into q2. The assertion lines are
; autogenerated (see the note on the first line of this file), so regenerate
; them with the update script rather than editing them by hand.
define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
; CHECK-LABEL: vst3_v4f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov.f64 d6, d15
; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
; CHECK-NEXT: vmov.f64 d10, d2
; CHECK-NEXT: vmov.f64 d7, d1
; CHECK-NEXT: vmov.f64 d11, d12
; CHECK-NEXT: vstrw.32 q3, [r1, #80]
; CHECK-NEXT: vmov.f64 d12, d4
; CHECK-NEXT: vstrw.32 q5, [r1]
; CHECK-NEXT: vmov.f64 d1, d5
; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d2, d8
; CHECK-NEXT: vstrw.32 q0, [r1, #64]
; CHECK-NEXT: vmov.f64 d13, d14
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vmov.f64 d8, d5
; CHECK-NEXT: vstrw.32 q6, [r1, #48]
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  ; Read three consecutive <4 x double> vectors from %src (vector indices 0, 1
  ; and 2). Each load is only marked align 4.
  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
  %l1 = load <4 x double>, <4 x double>* %s1, align 4
  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
  %l2 = load <4 x double>, <4 x double>* %s2, align 4
  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
  %l3 = load <4 x double>, <4 x double>* %s3, align 4
  ; %t1 is the concatenation %l1 ++ %l2. %t2 widens %l3 to eight lanes with
  ; undef padding so that both operands of the final shuffle share a type.
  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; 3-way interleave: result lanes are l1[i], l2[i], l3[i] for i = 0..3.
  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  ; The store carries no explicit alignment.
  store <12 x double> %s, <12 x double> *%dst
  ret void
}