1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s 3 4; i32 5 6define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) { 7; CHECK-LABEL: vld3_v2i32: 8; CHECK: @ %bb.0: @ %entry 9; CHECK-NEXT: .save {r7, lr} 10; CHECK-NEXT: push {r7, lr} 11; CHECK-NEXT: vldrw.u32 q0, [r0] 12; CHECK-NEXT: ldrd r2, r0, [r0, #16] 13; CHECK-NEXT: vmov.f64 d2, d0 14; CHECK-NEXT: vmov.f32 s6, s3 15; CHECK-NEXT: vmov r12, lr, d0 16; CHECK-NEXT: vmov r3, s6 17; CHECK-NEXT: add r2, r3 18; CHECK-NEXT: add.w r3, r12, lr 19; CHECK-NEXT: add r0, r2 20; CHECK-NEXT: vmov r2, s2 21; CHECK-NEXT: add r2, r3 22; CHECK-NEXT: strd r2, r0, [r1] 23; CHECK-NEXT: pop {r7, pc} 24entry: 25 %l1 = load <6 x i32>, <6 x i32>* %src, align 4 26 %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3> 27 %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4> 28 %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5> 29 %a1 = add <2 x i32> %s1, %s2 30 %a = add <2 x i32> %a1, %s3 31 store <2 x i32> %a, <2 x i32> *%dst 32 ret void 33} 34 35define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) { 36; CHECK-LABEL: vld3_v4i32: 37; CHECK: @ %bb.0: @ %entry 38; CHECK-NEXT: .vsave {d8, d9} 39; CHECK-NEXT: vpush {d8, d9} 40; CHECK-NEXT: vldrw.u32 q1, [r0] 41; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 42; CHECK-NEXT: vldrw.u32 q4, [r0, #32] 43; CHECK-NEXT: vmov.f64 d4, d2 44; CHECK-NEXT: vmov.f32 s12, s5 45; CHECK-NEXT: vmov.f32 s9, s7 46; CHECK-NEXT: vmov.f32 s13, s0 47; CHECK-NEXT: vmov.f32 s10, s2 48; CHECK-NEXT: vmov.f32 s14, s3 49; CHECK-NEXT: vmov.f32 s0, s6 50; CHECK-NEXT: vmov.f32 s2, s16 51; CHECK-NEXT: vmov.f32 s15, s18 52; CHECK-NEXT: vmov.f32 s11, s17 53; CHECK-NEXT: vadd.i32 q2, q2, q3 54; CHECK-NEXT: vmov.f32 s3, s19 55; CHECK-NEXT: vadd.i32 q0, q2, q0 56; CHECK-NEXT: vstrw.32 q0, [r1] 57; CHECK-NEXT: vpop 
{d8, d9} 58; CHECK-NEXT: bx lr 59entry: 60 %l1 = load <12 x i32>, <12 x i32>* %src, align 4 61 %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 62 %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 63 %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 64 %a1 = add <4 x i32> %s1, %s2 65 %a = add <4 x i32> %a1, %s3 66 store <4 x i32> %a, <4 x i32> *%dst 67 ret void 68} 69 70define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) { 71; CHECK-LABEL: vld3_v8i32: 72; CHECK: @ %bb.0: @ %entry 73; CHECK-NEXT: .vsave {d8, d9, d10, d11} 74; CHECK-NEXT: vpush {d8, d9, d10, d11} 75; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 76; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 77; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 78; CHECK-NEXT: vmov.f64 d4, d2 79; CHECK-NEXT: vmov.f32 s12, s5 80; CHECK-NEXT: vmov.f32 s9, s7 81; CHECK-NEXT: vmov.f32 s13, s0 82; CHECK-NEXT: vmov.f32 s10, s2 83; CHECK-NEXT: vmov.f32 s14, s3 84; CHECK-NEXT: vmov.f32 s0, s6 85; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 86; CHECK-NEXT: vmov.f32 s2, s16 87; CHECK-NEXT: vmov.f32 s15, s18 88; CHECK-NEXT: vmov.f32 s11, s17 89; CHECK-NEXT: vadd.i32 q2, q2, q3 90; CHECK-NEXT: vmov.f32 s3, s19 91; CHECK-NEXT: vadd.i32 q0, q2, q0 92; CHECK-NEXT: vldrw.u32 q2, [r0] 93; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 94; CHECK-NEXT: vstrw.32 q0, [r1, #16] 95; CHECK-NEXT: vmov.f32 s16, s9 96; CHECK-NEXT: vmov.f64 d10, d4 97; CHECK-NEXT: vmov.f32 s17, s4 98; CHECK-NEXT: vmov.f32 s21, s11 99; CHECK-NEXT: vmov.f32 s18, s7 100; CHECK-NEXT: vmov.f32 s22, s6 101; CHECK-NEXT: vmov.f32 s4, s10 102; CHECK-NEXT: vmov.f32 s6, s12 103; CHECK-NEXT: vmov.f32 s19, s14 104; CHECK-NEXT: vmov.f32 s23, s13 105; CHECK-NEXT: vadd.i32 q4, q5, q4 106; CHECK-NEXT: vmov.f32 s7, s15 107; CHECK-NEXT: vadd.i32 q1, q4, q1 108; CHECK-NEXT: vstrw.32 q1, [r1] 109; CHECK-NEXT: vpop {d8, d9, d10, d11} 110; CHECK-NEXT: bx lr 111entry: 112 %l1 = load <24 x i32>, 
<24 x i32>* %src, align 4 113 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 114 %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 115 %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 116 %a1 = add <8 x i32> %s1, %s2 117 %a = add <8 x i32> %a1, %s3 118 store <8 x i32> %a, <8 x i32> *%dst 119 ret void 120} 121 122define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) { 123; CHECK-LABEL: vld3_v16i32: 124; CHECK: @ %bb.0: @ %entry 125; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 126; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 127; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 128; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 129; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 130; CHECK-NEXT: vldrw.u32 q6, [r0, #176] 131; CHECK-NEXT: vmov.f64 d4, d2 132; CHECK-NEXT: vmov.f32 s12, s5 133; CHECK-NEXT: vmov.f32 s9, s7 134; CHECK-NEXT: vmov.f32 s13, s0 135; CHECK-NEXT: vmov.f32 s10, s2 136; CHECK-NEXT: vmov.f32 s14, s3 137; CHECK-NEXT: vmov.f32 s0, s6 138; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 139; CHECK-NEXT: vmov.f32 s2, s16 140; CHECK-NEXT: vmov.f32 s15, s18 141; CHECK-NEXT: vmov.f32 s11, s17 142; CHECK-NEXT: vadd.i32 q2, q2, q3 143; CHECK-NEXT: vmov.f32 s3, s19 144; CHECK-NEXT: vadd.i32 q0, q2, q0 145; CHECK-NEXT: vldrw.u32 q2, [r0] 146; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 147; CHECK-NEXT: vmov.f32 s16, s9 148; CHECK-NEXT: vmov.f64 d10, d4 149; CHECK-NEXT: vmov.f32 s17, s4 150; CHECK-NEXT: vmov.f32 s21, s11 151; CHECK-NEXT: vmov.f32 s18, s7 152; CHECK-NEXT: vmov.f32 s22, s6 153; CHECK-NEXT: vmov.f32 s4, s10 154; CHECK-NEXT: vldrw.u32 q2, [r0, #160] 155; CHECK-NEXT: vmov.f32 s6, s12 156; CHECK-NEXT: vmov.f32 s19, s14 157; CHECK-NEXT: vmov.f32 s23, s13 158; CHECK-NEXT: vmov.f32 s7, s15 159; CHECK-NEXT: vldrw.u32 q3, [r0, #144] 160; 
CHECK-NEXT: vadd.i32 q4, q5, q4 161; CHECK-NEXT: vmov.f32 s20, s13 162; CHECK-NEXT: vadd.i32 q1, q4, q1 163; CHECK-NEXT: vmov.f64 d8, d6 164; CHECK-NEXT: vmov.f32 s17, s15 165; CHECK-NEXT: vmov.f32 s21, s8 166; CHECK-NEXT: vmov.f32 s18, s10 167; CHECK-NEXT: vmov.f32 s22, s11 168; CHECK-NEXT: vmov.f32 s8, s14 169; CHECK-NEXT: vldrw.u32 q3, [r0, #112] 170; CHECK-NEXT: vmov.f32 s10, s24 171; CHECK-NEXT: vmov.f32 s23, s26 172; CHECK-NEXT: vmov.f32 s19, s25 173; CHECK-NEXT: vadd.i32 q4, q4, q5 174; CHECK-NEXT: vmov.f32 s11, s27 175; CHECK-NEXT: vadd.i32 q2, q4, q2 176; CHECK-NEXT: vldrw.u32 q4, [r0, #96] 177; CHECK-NEXT: vldrw.u32 q5, [r0, #128] 178; CHECK-NEXT: vstrw.32 q2, [r1, #48] 179; CHECK-NEXT: vmov.f32 s24, s17 180; CHECK-NEXT: vstrw.32 q0, [r1, #16] 181; CHECK-NEXT: vmov.f64 d14, d8 182; CHECK-NEXT: vstrw.32 q1, [r1] 183; CHECK-NEXT: vmov.f32 s25, s12 184; CHECK-NEXT: vmov.f32 s29, s19 185; CHECK-NEXT: vmov.f32 s26, s15 186; CHECK-NEXT: vmov.f32 s30, s14 187; CHECK-NEXT: vmov.f32 s12, s18 188; CHECK-NEXT: vmov.f32 s14, s20 189; CHECK-NEXT: vmov.f32 s27, s22 190; CHECK-NEXT: vmov.f32 s31, s21 191; CHECK-NEXT: vadd.i32 q6, q7, q6 192; CHECK-NEXT: vmov.f32 s15, s23 193; CHECK-NEXT: vadd.i32 q3, q6, q3 194; CHECK-NEXT: vstrw.32 q3, [r1, #32] 195; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 196; CHECK-NEXT: bx lr 197entry: 198 %l1 = load <48 x i32>, <48 x i32>* %src, align 4 199 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 200 %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 201 %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 202 
%a1 = add <16 x i32> %s1, %s2 203 %a = add <16 x i32> %a1, %s3 204 store <16 x i32> %a, <16 x i32> *%dst 205 ret void 206} 207 208; i16 209 210define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) { 211; CHECK-LABEL: vld3_v2i16: 212; CHECK: @ %bb.0: @ %entry 213; CHECK-NEXT: .pad #8 214; CHECK-NEXT: sub sp, #8 215; CHECK-NEXT: vldrh.u32 q0, [r0] 216; CHECK-NEXT: ldr r2, [r0, #8] 217; CHECK-NEXT: mov r3, sp 218; CHECK-NEXT: str r2, [sp] 219; CHECK-NEXT: vmov.f64 d2, d0 220; CHECK-NEXT: vmov.f32 s6, s3 221; CHECK-NEXT: vmov.f32 s8, s1 222; CHECK-NEXT: vmov.f64 d6, d1 223; CHECK-NEXT: vmov r0, s6 224; CHECK-NEXT: vldrh.u32 q1, [r3] 225; CHECK-NEXT: vmov.f32 s10, s4 226; CHECK-NEXT: vmov.f32 s14, s5 227; CHECK-NEXT: vmov r2, s10 228; CHECK-NEXT: add r0, r2 229; CHECK-NEXT: vmov r2, s14 230; CHECK-NEXT: add r0, r2 231; CHECK-NEXT: strh r0, [r1, #2] 232; CHECK-NEXT: vmov r0, s8 233; CHECK-NEXT: vmov r2, s0 234; CHECK-NEXT: add r0, r2 235; CHECK-NEXT: vmov r2, s12 236; CHECK-NEXT: add r0, r2 237; CHECK-NEXT: strh r0, [r1] 238; CHECK-NEXT: add sp, #8 239; CHECK-NEXT: bx lr 240entry: 241 %l1 = load <6 x i16>, <6 x i16>* %src, align 4 242 %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3> 243 %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4> 244 %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5> 245 %a1 = add <2 x i16> %s1, %s2 246 %a = add <2 x i16> %a1, %s3 247 store <2 x i16> %a, <2 x i16> *%dst 248 ret void 249} 250 251define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) { 252; CHECK-LABEL: vld3_v4i16: 253; CHECK: @ %bb.0: @ %entry 254; CHECK-NEXT: .save {r4, r5, r6, lr} 255; CHECK-NEXT: push {r4, r5, r6, lr} 256; CHECK-NEXT: vldrw.u32 q0, [r0] 257; CHECK-NEXT: vldrh.u32 q1, [r0, #16] 258; CHECK-NEXT: vmov.u16 r5, q0[6] 259; CHECK-NEXT: vmov.u16 r6, q0[0] 260; CHECK-NEXT: vmov r0, r3, d2 261; CHECK-NEXT: vmov.u16 lr, q0[2] 262; CHECK-NEXT: vmov r2, r4, d3 263; CHECK-NEXT: vmov q1[2], 
q1[0], r6, r5 264; CHECK-NEXT: vmov.u16 r5, q0[7] 265; CHECK-NEXT: vmov.u16 r6, q0[1] 266; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 267; CHECK-NEXT: vmov.u16 r5, q0[3] 268; CHECK-NEXT: vmov.u16 r6, q0[4] 269; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 270; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 271; CHECK-NEXT: vmov.u16 r12, q0[5] 272; CHECK-NEXT: vadd.i32 q0, q1, q2 273; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 274; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 275; CHECK-NEXT: vadd.i32 q0, q0, q1 276; CHECK-NEXT: vstrh.32 q0, [r1] 277; CHECK-NEXT: pop {r4, r5, r6, pc} 278entry: 279 %l1 = load <12 x i16>, <12 x i16>* %src, align 4 280 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 281 %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 282 %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 283 %a1 = add <4 x i16> %s1, %s2 284 %a = add <4 x i16> %a1, %s3 285 store <4 x i16> %a, <4 x i16> *%dst 286 ret void 287} 288 289define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { 290; CHECK-LABEL: vld3_v8i16: 291; CHECK: @ %bb.0: @ %entry 292; CHECK-NEXT: .vsave {d8, d9, d10, d11} 293; CHECK-NEXT: vpush {d8, d9, d10, d11} 294; CHECK-NEXT: vldrw.u32 q1, [r0] 295; CHECK-NEXT: vmovx.f16 s8, s6 296; CHECK-NEXT: vmov.f32 s0, s5 297; CHECK-NEXT: vins.f16 s0, s8 298; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 299; CHECK-NEXT: vmovx.f16 s12, s9 300; CHECK-NEXT: vmov.f32 s1, s8 301; CHECK-NEXT: vins.f16 s1, s12 302; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 303; CHECK-NEXT: vmov.f32 s2, s11 304; CHECK-NEXT: vmov.u16 r0, q2[5] 305; CHECK-NEXT: vmovx.f16 s20, s15 306; CHECK-NEXT: vmov.f32 s19, s14 307; CHECK-NEXT: vins.f16 s19, s20 308; CHECK-NEXT: vmov.f32 s18, s12 309; CHECK-NEXT: vmov q5, q4 310; CHECK-NEXT: vmovnb.i32 q5, q0 311; CHECK-NEXT: vmov.f32 s2, s22 312; CHECK-NEXT: vmovx.f16 s20, s5 313; CHECK-NEXT: vmov.f32 s3, s19 314; CHECK-NEXT: vmov.f64 d8, d2 315; CHECK-NEXT: 
vins.f16 s16, s20 316; CHECK-NEXT: vmovx.f16 s20, s8 317; CHECK-NEXT: vmov.f32 s17, s7 318; CHECK-NEXT: vins.f16 s17, s20 319; CHECK-NEXT: vmovx.f16 s20, s11 320; CHECK-NEXT: vmov.f32 s18, s10 321; CHECK-NEXT: vins.f16 s18, s20 322; CHECK-NEXT: vmovx.f16 s20, s14 323; CHECK-NEXT: vmov.f32 s19, s13 324; CHECK-NEXT: vins.f16 s19, s20 325; CHECK-NEXT: vmovx.f16 s20, s4 326; CHECK-NEXT: vins.f16 s20, s6 327; CHECK-NEXT: vmovx.f16 s21, s7 328; CHECK-NEXT: vins.f16 s6, s12 329; CHECK-NEXT: vmovx.f16 s7, s13 330; CHECK-NEXT: vins.f16 s21, s9 331; CHECK-NEXT: vins.f16 s7, s15 332; CHECK-NEXT: vmov.16 q5[4], r0 333; CHECK-NEXT: vmov q2, q1 334; CHECK-NEXT: vmovnb.i32 q2, q5 335; CHECK-NEXT: vmov.f32 s22, s10 336; CHECK-NEXT: vmov.f32 s23, s7 337; CHECK-NEXT: vadd.i16 q1, q4, q5 338; CHECK-NEXT: vadd.i16 q0, q1, q0 339; CHECK-NEXT: vstrw.32 q0, [r1] 340; CHECK-NEXT: vpop {d8, d9, d10, d11} 341; CHECK-NEXT: bx lr 342entry: 343 %l1 = load <24 x i16>, <24 x i16>* %src, align 4 344 %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 345 %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 346 %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 347 %a1 = add <8 x i16> %s1, %s2 348 %a = add <8 x i16> %a1, %s3 349 store <8 x i16> %a, <8 x i16> *%dst 350 ret void 351} 352 353define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) { 354; CHECK-LABEL: vld3_v16i16: 355; CHECK: @ %bb.0: @ %entry 356; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 357; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 358; CHECK-NEXT: .pad #16 359; CHECK-NEXT: sub sp, #16 360; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 361; CHECK-NEXT: vmov.f64 d0, d2 362; CHECK-NEXT: vmovx.f16 s8, s5 363; CHECK-NEXT: vins.f16 s0, s8 364; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 365; 
CHECK-NEXT: vmov.f32 s1, s7 366; CHECK-NEXT: vmovx.f16 s12, s8 367; CHECK-NEXT: vmovx.f16 s16, s9 368; CHECK-NEXT: vins.f16 s1, s12 369; CHECK-NEXT: vmovx.f16 s12, s11 370; CHECK-NEXT: vmov.f32 s2, s10 371; CHECK-NEXT: vmov.u16 r2, q2[5] 372; CHECK-NEXT: vins.f16 s2, s12 373; CHECK-NEXT: vmovx.f16 s12, s6 374; CHECK-NEXT: vins.f16 s5, s12 375; CHECK-NEXT: vmov.f32 s13, s8 376; CHECK-NEXT: vins.f16 s13, s16 377; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 378; CHECK-NEXT: vmov.f32 s12, s5 379; CHECK-NEXT: vmovx.f16 s20, s18 380; CHECK-NEXT: vmov.f32 s3, s17 381; CHECK-NEXT: vins.f16 s3, s20 382; CHECK-NEXT: vmovx.f16 s20, s19 383; CHECK-NEXT: vins.f16 s18, s20 384; CHECK-NEXT: vmov.f32 s14, s11 385; CHECK-NEXT: vmov.f32 s23, s18 386; CHECK-NEXT: vmov.f32 s22, s16 387; CHECK-NEXT: vmov q6, q5 388; CHECK-NEXT: vmovnb.i32 q6, q3 389; CHECK-NEXT: vmov.f32 s14, s26 390; CHECK-NEXT: vmov.f32 s15, s23 391; CHECK-NEXT: vmovx.f16 s20, s4 392; CHECK-NEXT: vins.f16 s20, s6 393; CHECK-NEXT: vmovx.f16 s21, s7 394; CHECK-NEXT: vins.f16 s6, s16 395; CHECK-NEXT: vmovx.f16 s7, s17 396; CHECK-NEXT: vins.f16 s21, s9 397; CHECK-NEXT: vins.f16 s7, s19 398; CHECK-NEXT: vmov.16 q5[4], r2 399; CHECK-NEXT: vmov q2, q1 400; CHECK-NEXT: vmovnb.i32 q2, q5 401; CHECK-NEXT: vmov.f32 s22, s10 402; CHECK-NEXT: vldrw.u32 q2, [r0] 403; CHECK-NEXT: vmov.f32 s23, s7 404; CHECK-NEXT: vadd.i16 q0, q0, q5 405; CHECK-NEXT: vmov.f32 s4, s9 406; CHECK-NEXT: vadd.i16 q0, q0, q3 407; CHECK-NEXT: vmovx.f16 s12, s10 408; CHECK-NEXT: vins.f16 s4, s12 409; CHECK-NEXT: vldrw.u32 q3, [r0, #16] 410; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 411; CHECK-NEXT: vmovx.f16 s0, s9 412; CHECK-NEXT: vmovx.f16 s16, s13 413; CHECK-NEXT: vmov.f32 s5, s12 414; CHECK-NEXT: vins.f16 s5, s16 415; CHECK-NEXT: vldrw.u32 q4, [r0, #32] 416; CHECK-NEXT: vmov.f32 s6, s15 417; CHECK-NEXT: vmov.u16 r0, q3[5] 418; CHECK-NEXT: vmovx.f16 s20, s19 419; CHECK-NEXT: vmov.f32 s27, s18 420; CHECK-NEXT: vins.f16 s27, s20 421; CHECK-NEXT: vmov.f64 d10, 
d4 422; CHECK-NEXT: vins.f16 s20, s0 423; CHECK-NEXT: vmov.f32 s26, s16 424; CHECK-NEXT: vmovx.f16 s0, s12 425; CHECK-NEXT: vmov.f32 s21, s11 426; CHECK-NEXT: vins.f16 s21, s0 427; CHECK-NEXT: vmov q7, q6 428; CHECK-NEXT: vmovnb.i32 q7, q1 429; CHECK-NEXT: vmovx.f16 s0, s15 430; CHECK-NEXT: vmov.f32 s22, s14 431; CHECK-NEXT: vins.f16 s22, s0 432; CHECK-NEXT: vmov.f32 s6, s30 433; CHECK-NEXT: vmov.f32 s7, s27 434; CHECK-NEXT: vmovx.f16 s24, s8 435; CHECK-NEXT: vmovx.f16 s0, s18 436; CHECK-NEXT: vmov.f32 s23, s17 437; CHECK-NEXT: vins.f16 s24, s10 438; CHECK-NEXT: vins.f16 s23, s0 439; CHECK-NEXT: vins.f16 s2, s16 440; CHECK-NEXT: vmovx.f16 s25, s11 441; CHECK-NEXT: vmovx.f16 s3, s17 442; CHECK-NEXT: vins.f16 s25, s13 443; CHECK-NEXT: vins.f16 s3, s19 444; CHECK-NEXT: vmov.16 q6[4], r0 445; CHECK-NEXT: vmov q2, q0 446; CHECK-NEXT: vmovnb.i32 q2, q6 447; CHECK-NEXT: vmov.f32 s26, s10 448; CHECK-NEXT: vmov.f32 s27, s3 449; CHECK-NEXT: vadd.i16 q0, q5, q6 450; CHECK-NEXT: vadd.i16 q0, q0, q1 451; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 452; CHECK-NEXT: vstrw.32 q0, [r1] 453; CHECK-NEXT: vstrw.32 q1, [r1, #16] 454; CHECK-NEXT: add sp, #16 455; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 456; CHECK-NEXT: bx lr 457entry: 458 %l1 = load <48 x i16>, <48 x i16>* %src, align 4 459 %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 460 %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 461 %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 462 %a1 = add <16 x i16> %s1, %s2 463 %a = add <16 x i16> %a1, %s3 464 store <16 x i16> %a, <16 x i16> *%dst 
465 ret void 466} 467 468; i8 469 470define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) { 471; CHECK-LABEL: vld3_v2i8: 472; CHECK: @ %bb.0: @ %entry 473; CHECK-NEXT: .pad #8 474; CHECK-NEXT: sub sp, #8 475; CHECK-NEXT: ldrd r2, r0, [r0] 476; CHECK-NEXT: strd r2, r0, [sp] 477; CHECK-NEXT: mov r0, sp 478; CHECK-NEXT: vldrb.u16 q0, [r0] 479; CHECK-NEXT: vmov.u16 r0, q0[4] 480; CHECK-NEXT: vmov.u16 r2, q0[3] 481; CHECK-NEXT: add r0, r2 482; CHECK-NEXT: vmov.u16 r2, q0[5] 483; CHECK-NEXT: add r0, r2 484; CHECK-NEXT: strb r0, [r1, #1] 485; CHECK-NEXT: vmov.u16 r0, q0[1] 486; CHECK-NEXT: vmov.u16 r2, q0[0] 487; CHECK-NEXT: add r0, r2 488; CHECK-NEXT: vmov.u16 r2, q0[2] 489; CHECK-NEXT: add r0, r2 490; CHECK-NEXT: strb r0, [r1] 491; CHECK-NEXT: add sp, #8 492; CHECK-NEXT: bx lr 493entry: 494 %l1 = load <6 x i8>, <6 x i8>* %src, align 4 495 %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3> 496 %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4> 497 %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5> 498 %a1 = add <2 x i8> %s1, %s2 499 %a = add <2 x i8> %a1, %s3 500 store <2 x i8> %a, <2 x i8> *%dst 501 ret void 502} 503 504define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) { 505; CHECK-LABEL: vld3_v4i8: 506; CHECK: @ %bb.0: @ %entry 507; CHECK-NEXT: .save {r4, lr} 508; CHECK-NEXT: push {r4, lr} 509; CHECK-NEXT: .pad #8 510; CHECK-NEXT: sub sp, #8 511; CHECK-NEXT: vldrb.u16 q0, [r0] 512; CHECK-NEXT: ldr r0, [r0, #8] 513; CHECK-NEXT: str r0, [sp] 514; CHECK-NEXT: vmov.u16 r3, q0[6] 515; CHECK-NEXT: vmov.u16 r4, q0[0] 516; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 517; CHECK-NEXT: vmov.u16 r3, q0[7] 518; CHECK-NEXT: vmov.u16 r4, q0[1] 519; CHECK-NEXT: vmov.u16 r12, q0[5] 520; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 521; CHECK-NEXT: mov r3, sp 522; CHECK-NEXT: vmov.u16 lr, q0[2] 523; CHECK-NEXT: vmov.u16 r2, q0[3] 524; CHECK-NEXT: vmov.u16 r0, q0[4] 525; CHECK-NEXT: vldrb.u16 q0, [r3] 526; CHECK-NEXT: 
vmov.u16 r3, q0[2] 527; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 528; CHECK-NEXT: vmov.u16 r0, q0[1] 529; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 530; CHECK-NEXT: vmov.u16 r0, q0[0] 531; CHECK-NEXT: vadd.i32 q1, q1, q2 532; CHECK-NEXT: vmov q2[2], q2[0], lr, r0 533; CHECK-NEXT: vmov.u16 r0, q0[3] 534; CHECK-NEXT: vmov q2[3], q2[1], r12, r0 535; CHECK-NEXT: vadd.i32 q0, q1, q2 536; CHECK-NEXT: vstrb.32 q0, [r1] 537; CHECK-NEXT: add sp, #8 538; CHECK-NEXT: pop {r4, pc} 539entry: 540 %l1 = load <12 x i8>, <12 x i8>* %src, align 4 541 %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 542 %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 543 %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 544 %a1 = add <4 x i8> %s1, %s2 545 %a = add <4 x i8> %a1, %s3 546 store <4 x i8> %a, <4 x i8> *%dst 547 ret void 548} 549 550define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { 551; CHECK-LABEL: vld3_v8i8: 552; CHECK: @ %bb.0: @ %entry 553; CHECK-NEXT: .vsave {d8, d9} 554; CHECK-NEXT: vpush {d8, d9} 555; CHECK-NEXT: vldrw.u32 q0, [r0] 556; CHECK-NEXT: vldrb.u16 q1, [r0, #16] 557; CHECK-NEXT: vmov.u8 r2, q0[1] 558; CHECK-NEXT: vmov.u8 r0, q0[0] 559; CHECK-NEXT: vmov.16 q2[0], r2 560; CHECK-NEXT: vmov.u8 r2, q0[4] 561; CHECK-NEXT: vmov.16 q2[1], r2 562; CHECK-NEXT: vmov.u8 r2, q0[7] 563; CHECK-NEXT: vmov.16 q3[0], r0 564; CHECK-NEXT: vmov.u8 r0, q0[3] 565; CHECK-NEXT: vmov.16 q2[2], r2 566; CHECK-NEXT: vmov.u8 r2, q0[10] 567; CHECK-NEXT: vmov.16 q3[1], r0 568; CHECK-NEXT: vmov.u8 r0, q0[6] 569; CHECK-NEXT: vmov.16 q2[3], r2 570; CHECK-NEXT: vmov.u8 r2, q0[13] 571; CHECK-NEXT: vmov.16 q3[2], r0 572; CHECK-NEXT: vmov.u8 r0, q0[9] 573; CHECK-NEXT: vmov.16 q2[4], r2 574; CHECK-NEXT: vmov.16 q3[3], r0 575; CHECK-NEXT: vmov.u8 r0, q0[12] 576; CHECK-NEXT: vins.f16 s10, s4 577; CHECK-NEXT: vmov.16 q3[4], r0 578; CHECK-NEXT: vmov.u8 r0, q0[15] 579; CHECK-NEXT: vmovx.f16 
s16, s6 580; CHECK-NEXT: vmov.f32 s18, s5 581; CHECK-NEXT: vmovx.f16 s11, s5 582; CHECK-NEXT: vmov.16 q3[5], r0 583; CHECK-NEXT: vins.f16 s18, s16 584; CHECK-NEXT: vins.f16 s11, s7 585; CHECK-NEXT: vmov.f32 s15, s18 586; CHECK-NEXT: vmov.u8 r0, q0[2] 587; CHECK-NEXT: vadd.i16 q2, q3, q2 588; CHECK-NEXT: vmov.16 q3[0], r0 589; CHECK-NEXT: vmov.u8 r0, q0[5] 590; CHECK-NEXT: vmov.16 q3[1], r0 591; CHECK-NEXT: vmov.u8 r0, q0[8] 592; CHECK-NEXT: vmov.16 q3[2], r0 593; CHECK-NEXT: vmov.u8 r0, q0[11] 594; CHECK-NEXT: vmov.16 q3[3], r0 595; CHECK-NEXT: vmov.u8 r0, q0[14] 596; CHECK-NEXT: vmov.16 q3[4], r0 597; CHECK-NEXT: vmov.u16 r0, q1[1] 598; CHECK-NEXT: vmovx.f16 s0, s7 599; CHECK-NEXT: vmov.f32 s2, s6 600; CHECK-NEXT: vins.f16 s2, s0 601; CHECK-NEXT: vmov.16 q3[5], r0 602; CHECK-NEXT: vmov.f32 s15, s2 603; CHECK-NEXT: vadd.i16 q0, q2, q3 604; CHECK-NEXT: vstrb.16 q0, [r1] 605; CHECK-NEXT: vpop {d8, d9} 606; CHECK-NEXT: bx lr 607entry: 608 %l1 = load <24 x i8>, <24 x i8>* %src, align 4 609 %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 610 %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 611 %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 612 %a1 = add <8 x i8> %s1, %s2 613 %a = add <8 x i8> %a1, %s3 614 store <8 x i8> %a, <8 x i8> *%dst 615 ret void 616} 617 618define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { 619; CHECK-LABEL: vld3_v16i8: 620; CHECK: @ %bb.0: @ %entry 621; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 622; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 623; CHECK-NEXT: vldrw.u32 q1, [r0] 624; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 625; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 626; CHECK-NEXT: vmov.u8 r2, q1[1] 627; CHECK-NEXT: vmov.8 q3[0], r2 628; CHECK-NEXT: vmov.u8 r2, q1[4] 629; CHECK-NEXT: vmov.8 q3[1], r2 630; 
CHECK-NEXT: vmov.u8 r2, q1[7] 631; CHECK-NEXT: vmov.8 q3[2], r2 632; CHECK-NEXT: vmov.u8 r2, q1[10] 633; CHECK-NEXT: vmov.8 q3[3], r2 634; CHECK-NEXT: vmov.u8 r2, q1[13] 635; CHECK-NEXT: vmov.8 q3[4], r2 636; CHECK-NEXT: vmov.u8 r2, q0[0] 637; CHECK-NEXT: vmov.8 q3[5], r2 638; CHECK-NEXT: vmov.u8 r2, q0[3] 639; CHECK-NEXT: vmov.8 q3[6], r2 640; CHECK-NEXT: vmov.u8 r2, q0[6] 641; CHECK-NEXT: vmov.8 q3[7], r2 642; CHECK-NEXT: vmov.u8 r2, q0[9] 643; CHECK-NEXT: vmov.u8 r0, q2[5] 644; CHECK-NEXT: vmov.8 q3[8], r2 645; CHECK-NEXT: vmov.u8 r2, q0[12] 646; CHECK-NEXT: vmov.8 q4[12], r0 647; CHECK-NEXT: vmov.u8 r0, q2[8] 648; CHECK-NEXT: vmov.8 q3[9], r2 649; CHECK-NEXT: vmov.u8 r2, q0[15] 650; CHECK-NEXT: vmov.8 q4[13], r0 651; CHECK-NEXT: vmov.u8 r0, q2[11] 652; CHECK-NEXT: vmov.8 q3[10], r2 653; CHECK-NEXT: vmov.8 q4[14], r0 654; CHECK-NEXT: vmov.u8 r0, q2[14] 655; CHECK-NEXT: vmov.8 q4[15], r0 656; CHECK-NEXT: vmov.u8 r0, q2[2] 657; CHECK-NEXT: vmov q5, q3 658; CHECK-NEXT: vmov.8 q5[11], r0 659; CHECK-NEXT: vmov.u8 r0, q1[0] 660; CHECK-NEXT: vmov.f32 s14, s22 661; CHECK-NEXT: vmov.f32 s15, s19 662; CHECK-NEXT: vmov.8 q4[0], r0 663; CHECK-NEXT: vmov.u8 r0, q1[3] 664; CHECK-NEXT: vmov.8 q4[1], r0 665; CHECK-NEXT: vmov.u8 r0, q1[6] 666; CHECK-NEXT: vmov.8 q4[2], r0 667; CHECK-NEXT: vmov.u8 r0, q1[9] 668; CHECK-NEXT: vmov.8 q4[3], r0 669; CHECK-NEXT: vmov.u8 r0, q1[12] 670; CHECK-NEXT: vmov.8 q4[4], r0 671; CHECK-NEXT: vmov.u8 r0, q1[15] 672; CHECK-NEXT: vmov.8 q4[5], r0 673; CHECK-NEXT: vmov.u8 r0, q0[2] 674; CHECK-NEXT: vmov.8 q4[6], r0 675; CHECK-NEXT: vmov.u8 r0, q0[5] 676; CHECK-NEXT: vmov.8 q4[7], r0 677; CHECK-NEXT: vmov.u8 r0, q0[8] 678; CHECK-NEXT: vmov.8 q4[8], r0 679; CHECK-NEXT: vmov.u8 r0, q0[11] 680; CHECK-NEXT: vmov.8 q4[9], r0 681; CHECK-NEXT: vmov.u8 r0, q0[14] 682; CHECK-NEXT: vmov.8 q4[10], r0 683; CHECK-NEXT: vmov.u8 r0, q2[4] 684; CHECK-NEXT: vmov.8 q5[12], r0 685; CHECK-NEXT: vmov.u8 r0, q2[7] 686; CHECK-NEXT: vmov.8 q5[13], r0 687; CHECK-NEXT: 
vmov.u8 r0, q2[10] 688; CHECK-NEXT: vmov.8 q5[14], r0 689; CHECK-NEXT: vmov.u8 r0, q2[13] 690; CHECK-NEXT: vmov.8 q5[15], r0 691; CHECK-NEXT: vmov.u8 r0, q2[1] 692; CHECK-NEXT: vmov q6, q4 693; CHECK-NEXT: vmov.8 q6[11], r0 694; CHECK-NEXT: vmov.u8 r0, q1[2] 695; CHECK-NEXT: vmov.f32 s18, s26 696; CHECK-NEXT: vmov.f32 s19, s23 697; CHECK-NEXT: vadd.i8 q3, q4, q3 698; CHECK-NEXT: vmov.8 q4[0], r0 699; CHECK-NEXT: vmov.u8 r0, q1[5] 700; CHECK-NEXT: vmov.8 q4[1], r0 701; CHECK-NEXT: vmov.u8 r0, q1[8] 702; CHECK-NEXT: vmov.8 q4[2], r0 703; CHECK-NEXT: vmov.u8 r0, q1[11] 704; CHECK-NEXT: vmov.8 q4[3], r0 705; CHECK-NEXT: vmov.u8 r0, q1[14] 706; CHECK-NEXT: vmov.8 q4[4], r0 707; CHECK-NEXT: vmov.u8 r0, q0[1] 708; CHECK-NEXT: vmov.8 q4[5], r0 709; CHECK-NEXT: vmov.u8 r0, q0[4] 710; CHECK-NEXT: vmov.8 q4[6], r0 711; CHECK-NEXT: vmov.u8 r0, q2[6] 712; CHECK-NEXT: vmov.8 q1[12], r0 713; CHECK-NEXT: vmov.u8 r0, q2[9] 714; CHECK-NEXT: vmov.8 q1[13], r0 715; CHECK-NEXT: vmov.u8 r0, q2[12] 716; CHECK-NEXT: vmov.8 q1[14], r0 717; CHECK-NEXT: vmov.u8 r0, q2[15] 718; CHECK-NEXT: vmov.8 q1[15], r0 719; CHECK-NEXT: vmov.u8 r0, q0[10] 720; CHECK-NEXT: vmov.8 q5[8], r0 721; CHECK-NEXT: vmov.u8 r0, q0[13] 722; CHECK-NEXT: vmov.8 q5[9], r0 723; CHECK-NEXT: vmov.u8 r0, q2[0] 724; CHECK-NEXT: vmov.8 q5[10], r0 725; CHECK-NEXT: vmov.u8 r0, q2[3] 726; CHECK-NEXT: vmov.8 q5[11], r0 727; CHECK-NEXT: vmov.u8 r0, q0[7] 728; CHECK-NEXT: vmov.8 q4[7], r0 729; CHECK-NEXT: vmov.f32 s18, s22 730; CHECK-NEXT: vmov.f32 s19, s7 731; CHECK-NEXT: vadd.i8 q0, q3, q4 732; CHECK-NEXT: vstrw.32 q0, [r1] 733; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 734; CHECK-NEXT: bx lr 735entry: 736 %l1 = load <48 x i8>, <48 x i8>* %src, align 4 737 %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 738 %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 
7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 739 %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 740 %a1 = add <16 x i8> %s1, %s2 741 %a = add <16 x i8> %a1, %s3 742 store <16 x i8> %a, <16 x i8> *%dst 743 ret void 744} 745 746; i64 747 748define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) { 749; CHECK-LABEL: vld3_v2i64: 750; CHECK: @ %bb.0: @ %entry 751; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} 752; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} 753; CHECK-NEXT: vldrw.u32 q0, [r0] 754; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 755; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 756; CHECK-NEXT: vmov.f64 d6, d1 757; CHECK-NEXT: vmov.f32 s13, s3 758; CHECK-NEXT: vmov.f32 s14, s4 759; CHECK-NEXT: vmov.f32 s2, s10 760; CHECK-NEXT: vmov.f32 s3, s11 761; CHECK-NEXT: vmov.f32 s15, s5 762; CHECK-NEXT: vmov.f32 s10, s6 763; CHECK-NEXT: vmov.f32 s11, s7 764; CHECK-NEXT: vmov r5, r8, d6 765; CHECK-NEXT: vmov r6, r7, d0 766; CHECK-NEXT: vmov r0, r3, d1 767; CHECK-NEXT: vmov lr, r12, d7 768; CHECK-NEXT: vmov r2, r4, d5 769; CHECK-NEXT: adds.w r0, r0, lr 770; CHECK-NEXT: adc.w r3, r3, r12 771; CHECK-NEXT: adds r0, r0, r2 772; CHECK-NEXT: adc.w r2, r3, r4 773; CHECK-NEXT: vmov r3, r4, d4 774; CHECK-NEXT: adds r6, r6, r5 775; CHECK-NEXT: adc.w r7, r7, r8 776; CHECK-NEXT: adds r3, r3, r6 777; CHECK-NEXT: adcs r7, r4 778; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 779; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 780; CHECK-NEXT: vstrw.32 q0, [r1] 781; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} 782entry: 783 %l1 = load <6 x i64>, <6 x i64>* %src, align 4 784 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3> 785 %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4> 786 %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5> 787 %a1 = 
add <2 x i64> %s1, %s2 788 %a = add <2 x i64> %a1, %s3 789 store <2 x i64> %a, <2 x i64> *%dst 790 ret void 791} 792 793define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) { 794; CHECK-LABEL: vld3_v4i64: 795; CHECK: @ %bb.0: @ %entry 796; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} 797; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} 798; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 799; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 800; CHECK-NEXT: vldrw.u32 q0, [r0] 801; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 802; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 803; CHECK-NEXT: vldrw.u32 q6, [r0, #80] 804; CHECK-NEXT: vmov.f64 d2, d1 805; CHECK-NEXT: vldrw.u32 q4, [r0, #64] 806; CHECK-NEXT: vmov.f32 s5, s3 807; CHECK-NEXT: vmov.f32 s6, s12 808; CHECK-NEXT: vmov.f32 s2, s10 809; CHECK-NEXT: vmov.f32 s3, s11 810; CHECK-NEXT: vmov.f32 s10, s14 811; CHECK-NEXT: vmov.f32 s7, s13 812; CHECK-NEXT: vmov.f32 s11, s15 813; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 814; CHECK-NEXT: vmov.f64 d10, d7 815; CHECK-NEXT: vmov lr, r12, d3 816; CHECK-NEXT: vmov r5, r4, d1 817; CHECK-NEXT: vmov r3, r8, d5 818; CHECK-NEXT: vmov.f32 s21, s15 819; CHECK-NEXT: vmov.f32 s22, s24 820; CHECK-NEXT: vmov.f32 s14, s18 821; CHECK-NEXT: vmov.f32 s23, s25 822; CHECK-NEXT: vmov.f32 s15, s19 823; CHECK-NEXT: vmov.f32 s18, s26 824; CHECK-NEXT: vmov r6, r7, d10 825; CHECK-NEXT: vmov.f32 s19, s27 826; CHECK-NEXT: adds.w r0, r5, lr 827; CHECK-NEXT: adc.w r5, r4, r12 828; CHECK-NEXT: adds.w lr, r0, r3 829; CHECK-NEXT: vmov r4, r2, d6 830; CHECK-NEXT: adc.w r12, r5, r8 831; CHECK-NEXT: vmov r5, r0, d8 832; CHECK-NEXT: adds r6, r6, r4 833; CHECK-NEXT: adcs r2, r7 834; CHECK-NEXT: adds r6, r6, r5 835; CHECK-NEXT: adc.w r8, r2, r0 836; CHECK-NEXT: vmov r7, r4, d11 837; CHECK-NEXT: vmov r2, r5, d7 838; CHECK-NEXT: vmov r3, r0, d0 839; CHECK-NEXT: adds r2, r2, r7 840; CHECK-NEXT: adc.w r7, r5, r4 841; CHECK-NEXT: vmov r5, r4, d9 842; CHECK-NEXT: adds r2, r2, r5 843; CHECK-NEXT: adcs r7, r4 844; CHECK-NEXT: vmov r5, r4, d2 
845; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 846; CHECK-NEXT: vmov q1[3], q1[1], r8, r7 847; CHECK-NEXT: vstrw.32 q1, [r1, #16] 848; CHECK-NEXT: adds r3, r3, r5 849; CHECK-NEXT: adcs r0, r4 850; CHECK-NEXT: vmov r4, r5, d4 851; CHECK-NEXT: adds r3, r3, r4 852; CHECK-NEXT: vmov q0[2], q0[0], r3, lr 853; CHECK-NEXT: adcs r0, r5 854; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 855; CHECK-NEXT: vstrw.32 q0, [r1] 856; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 857; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} 858entry: 859 %l1 = load <12 x i64>, <12 x i64>* %src, align 4 860 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 861 %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 862 %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 863 %a1 = add <4 x i64> %s1, %s2 864 %a = add <4 x i64> %a1, %s3 865 store <4 x i64> %a, <4 x i64> *%dst 866 ret void 867} 868 869; f32 870 871define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) { 872; CHECK-LABEL: vld3_v2f32: 873; CHECK: @ %bb.0: @ %entry 874; CHECK-NEXT: vldrw.u32 q2, [r0] 875; CHECK-NEXT: vldr s1, [r0, #16] 876; CHECK-NEXT: vldr s5, [r0, #20] 877; CHECK-NEXT: vmov.f64 d6, d4 878; CHECK-NEXT: vmov.f32 s13, s11 879; CHECK-NEXT: vmov.f32 s0, s9 880; CHECK-NEXT: vadd.f32 q0, q3, q0 881; CHECK-NEXT: vmov.f32 s4, s10 882; CHECK-NEXT: vadd.f32 q0, q0, q1 883; CHECK-NEXT: vstmia r1, {s0, s1} 884; CHECK-NEXT: bx lr 885entry: 886 %l1 = load <6 x float>, <6 x float>* %src, align 4 887 %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3> 888 %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4> 889 %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5> 890 %a1 = fadd <2 x float> %s1, %s2 891 %a = fadd <2 x float> %a1, %s3 892 store <2 x float> %a, <2 x float> *%dst 893 ret void 894} 895 896define void @vld3_v4f32(<12 x 
float> *%src, <4 x float> *%dst) { 897; CHECK-LABEL: vld3_v4f32: 898; CHECK: @ %bb.0: @ %entry 899; CHECK-NEXT: .vsave {d8, d9} 900; CHECK-NEXT: vpush {d8, d9} 901; CHECK-NEXT: vldrw.u32 q1, [r0] 902; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 903; CHECK-NEXT: vldrw.u32 q4, [r0, #32] 904; CHECK-NEXT: vmov.f64 d4, d2 905; CHECK-NEXT: vmov.f32 s12, s5 906; CHECK-NEXT: vmov.f32 s9, s7 907; CHECK-NEXT: vmov.f32 s13, s0 908; CHECK-NEXT: vmov.f32 s10, s2 909; CHECK-NEXT: vmov.f32 s14, s3 910; CHECK-NEXT: vmov.f32 s0, s6 911; CHECK-NEXT: vmov.f32 s2, s16 912; CHECK-NEXT: vmov.f32 s15, s18 913; CHECK-NEXT: vmov.f32 s11, s17 914; CHECK-NEXT: vadd.f32 q2, q2, q3 915; CHECK-NEXT: vmov.f32 s3, s19 916; CHECK-NEXT: vadd.f32 q0, q2, q0 917; CHECK-NEXT: vstrw.32 q0, [r1] 918; CHECK-NEXT: vpop {d8, d9} 919; CHECK-NEXT: bx lr 920entry: 921 %l1 = load <12 x float>, <12 x float>* %src, align 4 922 %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 923 %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 924 %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 925 %a1 = fadd <4 x float> %s1, %s2 926 %a = fadd <4 x float> %a1, %s3 927 store <4 x float> %a, <4 x float> *%dst 928 ret void 929} 930 931define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) { 932; CHECK-LABEL: vld3_v8f32: 933; CHECK: @ %bb.0: @ %entry 934; CHECK-NEXT: .vsave {d8, d9, d10, d11} 935; CHECK-NEXT: vpush {d8, d9, d10, d11} 936; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 937; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 938; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 939; CHECK-NEXT: vmov.f64 d4, d2 940; CHECK-NEXT: vmov.f32 s12, s5 941; CHECK-NEXT: vmov.f32 s9, s7 942; CHECK-NEXT: vmov.f32 s13, s0 943; CHECK-NEXT: vmov.f32 s10, s2 944; CHECK-NEXT: vmov.f32 s14, s3 945; CHECK-NEXT: vmov.f32 s0, s6 946; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 947; CHECK-NEXT: vmov.f32 s2, s16 948; CHECK-NEXT: vmov.f32 s15, 
s18 949; CHECK-NEXT: vmov.f32 s11, s17 950; CHECK-NEXT: vadd.f32 q2, q2, q3 951; CHECK-NEXT: vmov.f32 s3, s19 952; CHECK-NEXT: vadd.f32 q0, q2, q0 953; CHECK-NEXT: vldrw.u32 q2, [r0] 954; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 955; CHECK-NEXT: vstrw.32 q0, [r1, #16] 956; CHECK-NEXT: vmov.f32 s16, s9 957; CHECK-NEXT: vmov.f64 d10, d4 958; CHECK-NEXT: vmov.f32 s17, s4 959; CHECK-NEXT: vmov.f32 s21, s11 960; CHECK-NEXT: vmov.f32 s18, s7 961; CHECK-NEXT: vmov.f32 s22, s6 962; CHECK-NEXT: vmov.f32 s4, s10 963; CHECK-NEXT: vmov.f32 s6, s12 964; CHECK-NEXT: vmov.f32 s19, s14 965; CHECK-NEXT: vmov.f32 s23, s13 966; CHECK-NEXT: vadd.f32 q4, q5, q4 967; CHECK-NEXT: vmov.f32 s7, s15 968; CHECK-NEXT: vadd.f32 q1, q4, q1 969; CHECK-NEXT: vstrw.32 q1, [r1] 970; CHECK-NEXT: vpop {d8, d9, d10, d11} 971; CHECK-NEXT: bx lr 972entry: 973 %l1 = load <24 x float>, <24 x float>* %src, align 4 974 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 975 %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 976 %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 977 %a1 = fadd <8 x float> %s1, %s2 978 %a = fadd <8 x float> %a1, %s3 979 store <8 x float> %a, <8 x float> *%dst 980 ret void 981} 982
; vld3_v16f32 below deinterleaves the 48-float input in four 16-byte-wide
; groups; note the vldrw offsets running up to #176 in the expected output.
983define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) { 984; CHECK-LABEL: vld3_v16f32: 985; CHECK: @ %bb.0: @ %entry 986; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 987; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 988; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 989; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 990; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 991; CHECK-NEXT: vldrw.u32 q6, [r0, #176] 992; CHECK-NEXT: vmov.f64 d4, d2 993; CHECK-NEXT: vmov.f32 s12, s5 994; CHECK-NEXT: vmov.f32 s9, s7 995; CHECK-NEXT: vmov.f32 s13, s0 996; CHECK-NEXT: vmov.f32 
s10, s2 997; CHECK-NEXT: vmov.f32 s14, s3 998; CHECK-NEXT: vmov.f32 s0, s6 999; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1000; CHECK-NEXT: vmov.f32 s2, s16 1001; CHECK-NEXT: vmov.f32 s15, s18 1002; CHECK-NEXT: vmov.f32 s11, s17 1003; CHECK-NEXT: vadd.f32 q2, q2, q3 1004; CHECK-NEXT: vmov.f32 s3, s19 1005; CHECK-NEXT: vadd.f32 q0, q2, q0 1006; CHECK-NEXT: vldrw.u32 q2, [r0] 1007; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1008; CHECK-NEXT: vmov.f32 s16, s9 1009; CHECK-NEXT: vmov.f64 d10, d4 1010; CHECK-NEXT: vmov.f32 s17, s4 1011; CHECK-NEXT: vmov.f32 s21, s11 1012; CHECK-NEXT: vmov.f32 s18, s7 1013; CHECK-NEXT: vmov.f32 s22, s6 1014; CHECK-NEXT: vmov.f32 s4, s10 1015; CHECK-NEXT: vldrw.u32 q2, [r0, #160] 1016; CHECK-NEXT: vmov.f32 s6, s12 1017; CHECK-NEXT: vmov.f32 s19, s14 1018; CHECK-NEXT: vmov.f32 s23, s13 1019; CHECK-NEXT: vmov.f32 s7, s15 1020; CHECK-NEXT: vldrw.u32 q3, [r0, #144] 1021; CHECK-NEXT: vadd.f32 q4, q5, q4 1022; CHECK-NEXT: vmov.f32 s20, s13 1023; CHECK-NEXT: vadd.f32 q1, q4, q1 1024; CHECK-NEXT: vmov.f64 d8, d6 1025; CHECK-NEXT: vmov.f32 s17, s15 1026; CHECK-NEXT: vmov.f32 s21, s8 1027; CHECK-NEXT: vmov.f32 s18, s10 1028; CHECK-NEXT: vmov.f32 s22, s11 1029; CHECK-NEXT: vmov.f32 s8, s14 1030; CHECK-NEXT: vldrw.u32 q3, [r0, #112] 1031; CHECK-NEXT: vmov.f32 s10, s24 1032; CHECK-NEXT: vmov.f32 s23, s26 1033; CHECK-NEXT: vmov.f32 s19, s25 1034; CHECK-NEXT: vadd.f32 q4, q4, q5 1035; CHECK-NEXT: vmov.f32 s11, s27 1036; CHECK-NEXT: vadd.f32 q2, q4, q2 1037; CHECK-NEXT: vldrw.u32 q4, [r0, #96] 1038; CHECK-NEXT: vldrw.u32 q5, [r0, #128] 1039; CHECK-NEXT: vstrw.32 q2, [r1, #48] 1040; CHECK-NEXT: vmov.f32 s24, s17 1041; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1042; CHECK-NEXT: vmov.f64 d14, d8 1043; CHECK-NEXT: vstrw.32 q1, [r1] 1044; CHECK-NEXT: vmov.f32 s25, s12 1045; CHECK-NEXT: vmov.f32 s29, s19 1046; CHECK-NEXT: vmov.f32 s26, s15 1047; CHECK-NEXT: vmov.f32 s30, s14 1048; CHECK-NEXT: vmov.f32 s12, s18 1049; CHECK-NEXT: vmov.f32 s14, s20 1050; CHECK-NEXT: vmov.f32 s27, s22 
1051; CHECK-NEXT: vmov.f32 s31, s21 1052; CHECK-NEXT: vadd.f32 q6, q7, q6 1053; CHECK-NEXT: vmov.f32 s15, s23 1054; CHECK-NEXT: vadd.f32 q3, q6, q3 1055; CHECK-NEXT: vstrw.32 q3, [r1, #32] 1056; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1057; CHECK-NEXT: bx lr 1058entry: 1059 %l1 = load <48 x float>, <48 x float>* %src, align 4 1060 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1061 %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1062 %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1063 %a1 = fadd <16 x float> %s1, %s2 1064 %a = fadd <16 x float> %a1, %s3 1065 store <16 x float> %a, <16 x float> *%dst 1066 ret void 1067} 1068 1069; f16 1070 1071define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) { 1072; CHECK-LABEL: vld3_v2f16: 1073; CHECK: @ %bb.0: @ %entry 1074; CHECK-NEXT: ldrd r2, r3, [r0] 1075; CHECK-NEXT: ldr r0, [r0, #8] 1076; CHECK-NEXT: vmov.32 q0[0], r2 1077; CHECK-NEXT: vmov.32 q0[1], r3 1078; CHECK-NEXT: vmov.32 q0[2], r0 1079; CHECK-NEXT: vmovx.f16 s8, s0 1080; CHECK-NEXT: vmovx.f16 s4, s2 1081; CHECK-NEXT: vins.f16 s8, s2 1082; CHECK-NEXT: vmovx.f16 s6, s1 1083; CHECK-NEXT: vins.f16 s1, s4 1084; CHECK-NEXT: vins.f16 s0, s6 1085; CHECK-NEXT: vadd.f16 q1, q0, q2 1086; CHECK-NEXT: vmov.f32 s0, s1 1087; CHECK-NEXT: vadd.f16 q0, q1, q0 1088; CHECK-NEXT: vmov r0, s0 1089; CHECK-NEXT: str r0, [r1] 1090; CHECK-NEXT: bx lr 1091entry: 1092 %l1 = load <6 x half>, <6 x half>* %src, align 4 1093 %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3> 1094 %s2 = shufflevector <6 x half>
%l1, <6 x half> undef, <2 x i32> <i32 1, i32 4> 1095 %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5> 1096 %a1 = fadd <2 x half> %s1, %s2 1097 %a = fadd <2 x half> %a1, %s3 1098 store <2 x half> %a, <2 x half> *%dst 1099 ret void 1100} 1101
; The f16 cases deinterleave with vmovx.f16/vins.f16 pairs that split and
; merge even/odd half-precision lanes, as the expected output below shows.
1102define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { 1103; CHECK-LABEL: vld3_v4f16: 1104; CHECK: @ %bb.0: @ %entry 1105; CHECK-NEXT: .vsave {d8} 1106; CHECK-NEXT: vpush {d8} 1107; CHECK-NEXT: ldrd r2, r3, [r0, #16] 1108; CHECK-NEXT: vmov.32 q2[0], r2 1109; CHECK-NEXT: vmov.32 q2[1], r3 1110; CHECK-NEXT: vmov.f32 s1, s8 1111; CHECK-NEXT: vmovx.f16 s4, s9 1112; CHECK-NEXT: vins.f16 s1, s4 1113; CHECK-NEXT: vldrw.u32 q1, [r0] 1114; CHECK-NEXT: vmovx.f16 s8, s8 1115; CHECK-NEXT: vmovx.f16 s12, s4 1116; CHECK-NEXT: vmovx.f16 s16, s5 1117; CHECK-NEXT: vins.f16 s12, s6 1118; CHECK-NEXT: vins.f16 s4, s16 1119; CHECK-NEXT: vmovx.f16 s16, s6 1120; CHECK-NEXT: vins.f16 s5, s16 1121; CHECK-NEXT: vmovx.f16 s13, s7 1122; CHECK-NEXT: vins.f16 s7, s8 1123; CHECK-NEXT: vmov.f32 s0, s5 1124; CHECK-NEXT: vins.f16 s13, s9 1125; CHECK-NEXT: vmov.f32 s5, s7 1126; CHECK-NEXT: vadd.f16 q1, q1, q3 1127; CHECK-NEXT: vadd.f16 q0, q1, q0 1128; CHECK-NEXT: vmov r0, r2, d0 1129; CHECK-NEXT: strd r0, r2, [r1] 1130; CHECK-NEXT: vpop {d8} 1131; CHECK-NEXT: bx lr 1132entry: 1133 %l1 = load <12 x half>, <12 x half>* %src, align 4 1134 %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1135 %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1136 %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1137 %a1 = fadd <4 x half> %s1, %s2 1138 %a = fadd <4 x half> %a1, %s3 1139 store <4 x half> %a, <4 x half> *%dst 1140 ret void 1141} 1142 1143define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { 1144; CHECK-LABEL: vld3_v8f16: 1145; CHECK: @ %bb.0: @ %entry 1146; CHECK-NEXT: .vsave {d8, 
d9, d10, d11, d12, d13, d14, d15} 1147; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1148; CHECK-NEXT: vldrw.u32 q0, [r0] 1149; CHECK-NEXT: vldrw.u32 q4, [r0, #16] 1150; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1151; CHECK-NEXT: vmovx.f16 s8, s2 1152; CHECK-NEXT: vmov.f32 s4, s1 1153; CHECK-NEXT: vins.f16 s4, s8 1154; CHECK-NEXT: vmovx.f16 s8, s17 1155; CHECK-NEXT: vmov.f32 s5, s16 1156; CHECK-NEXT: vmovx.f16 s20, s15 1157; CHECK-NEXT: vins.f16 s5, s8 1158; CHECK-NEXT: vmov.f32 s11, s14 1159; CHECK-NEXT: vins.f16 s11, s20 1160; CHECK-NEXT: vmov.f32 s6, s19 1161; CHECK-NEXT: vmovx.f16 s20, s12 1162; CHECK-NEXT: vmov.f32 s28, s18 1163; CHECK-NEXT: vins.f16 s6, s20 1164; CHECK-NEXT: vmovx.f16 s20, s19 1165; CHECK-NEXT: vins.f16 s28, s20 1166; CHECK-NEXT: vmovx.f16 s24, s1 1167; CHECK-NEXT: vmovx.f16 s20, s0 1168; CHECK-NEXT: vins.f16 s0, s24 1169; CHECK-NEXT: vins.f16 s20, s2 1170; CHECK-NEXT: vmovx.f16 s26, s16 1171; CHECK-NEXT: vmovx.f16 s21, s3 1172; CHECK-NEXT: vins.f16 s3, s26 1173; CHECK-NEXT: vins.f16 s21, s17 1174; CHECK-NEXT: vmovx.f16 s30, s14 1175; CHECK-NEXT: vmovx.f16 s23, s13 1176; CHECK-NEXT: vmov.f32 s10, s12 1177; CHECK-NEXT: vmov.f32 s1, s3 1178; CHECK-NEXT: vins.f16 s13, s30 1179; CHECK-NEXT: vins.f16 s23, s15 1180; CHECK-NEXT: vmov.f32 s2, s28 1181; CHECK-NEXT: vmovx.f16 s22, s18 1182; CHECK-NEXT: vmov.f32 s3, s13 1183; CHECK-NEXT: vins.f16 s22, s12 1184; CHECK-NEXT: vmov.f32 s7, s11 1185; CHECK-NEXT: vadd.f16 q0, q0, q5 1186; CHECK-NEXT: vadd.f16 q0, q0, q1 1187; CHECK-NEXT: vstrw.32 q0, [r1] 1188; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1189; CHECK-NEXT: bx lr 1190entry: 1191 %l1 = load <24 x half>, <24 x half>* %src, align 4 1192 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 1193 %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 1194 %s3 = shufflevector <24 x half> 
%l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 1195 %a1 = fadd <8 x half> %s1, %s2 1196 %a = fadd <8 x half> %a1, %s3 1197 store <8 x half> %a, <8 x half> *%dst 1198 ret void 1199} 1200 1201define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { 1202; CHECK-LABEL: vld3_v16f16: 1203; CHECK: @ %bb.0: @ %entry 1204; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1205; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1206; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 1207; CHECK-NEXT: vldrw.u32 q3, [r0, #64] 1208; CHECK-NEXT: vmovx.f16 s8, s2 1209; CHECK-NEXT: vmov.f32 s4, s1 1210; CHECK-NEXT: vins.f16 s4, s8 1211; CHECK-NEXT: vmovx.f16 s8, s13 1212; CHECK-NEXT: vmov.f32 s5, s12 1213; CHECK-NEXT: vmovx.f16 s24, s1 1214; CHECK-NEXT: vins.f16 s5, s8 1215; CHECK-NEXT: vldrw.u32 q2, [r0, #80] 1216; CHECK-NEXT: vmov.f32 s6, s15 1217; CHECK-NEXT: vmovx.f16 s26, s12 1218; CHECK-NEXT: vmovx.f16 s20, s11 1219; CHECK-NEXT: vmov.f32 s19, s10 1220; CHECK-NEXT: vins.f16 s19, s20 1221; CHECK-NEXT: vmovx.f16 s20, s8 1222; CHECK-NEXT: vins.f16 s6, s20 1223; CHECK-NEXT: vmovx.f16 s20, s15 1224; CHECK-NEXT: vmov.f32 s28, s14 1225; CHECK-NEXT: vmovx.f16 s30, s10 1226; CHECK-NEXT: vins.f16 s28, s20 1227; CHECK-NEXT: vmovx.f16 s20, s0 1228; CHECK-NEXT: vins.f16 s0, s24 1229; CHECK-NEXT: vins.f16 s20, s2 1230; CHECK-NEXT: vmovx.f16 s21, s3 1231; CHECK-NEXT: vins.f16 s3, s26 1232; CHECK-NEXT: vins.f16 s21, s13 1233; CHECK-NEXT: vmov.f32 s18, s8 1234; CHECK-NEXT: vmovx.f16 s23, s9 1235; CHECK-NEXT: vmov.f32 s1, s3 1236; CHECK-NEXT: vins.f16 s9, s30 1237; CHECK-NEXT: vins.f16 s23, s11 1238; CHECK-NEXT: vmovx.f16 s22, s14 1239; CHECK-NEXT: vmov.f32 s2, s28 1240; CHECK-NEXT: vins.f16 s22, s8 1241; CHECK-NEXT: vmov.f32 s3, s9 1242; CHECK-NEXT: vmov.f32 s7, s19 1243; CHECK-NEXT: vadd.f16 q0, q0, q5 1244; CHECK-NEXT: vadd.f16 q1, q0, q1 1245; CHECK-NEXT: vldrw.u32 q0, [r0] 1246; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 1247; CHECK-NEXT: 
vldrw.u32 q3, [r0, #16] 1248; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1249; CHECK-NEXT: vmovx.f16 s16, s2 1250; CHECK-NEXT: vmov.f32 s4, s1 1251; CHECK-NEXT: vmovx.f16 s20, s11 1252; CHECK-NEXT: vins.f16 s4, s16 1253; CHECK-NEXT: vmovx.f16 s16, s13 1254; CHECK-NEXT: vmov.f32 s5, s12 1255; CHECK-NEXT: vmovx.f16 s24, s1 1256; CHECK-NEXT: vins.f16 s5, s16 1257; CHECK-NEXT: vmov.f32 s19, s10 1258; CHECK-NEXT: vins.f16 s19, s20 1259; CHECK-NEXT: vmov.f32 s6, s15 1260; CHECK-NEXT: vmovx.f16 s20, s8 1261; CHECK-NEXT: vmov.f32 s28, s14 1262; CHECK-NEXT: vins.f16 s6, s20 1263; CHECK-NEXT: vmovx.f16 s20, s15 1264; CHECK-NEXT: vins.f16 s28, s20 1265; CHECK-NEXT: vmovx.f16 s20, s0 1266; CHECK-NEXT: vins.f16 s0, s24 1267; CHECK-NEXT: vins.f16 s20, s2 1268; CHECK-NEXT: vmovx.f16 s21, s3 1269; CHECK-NEXT: vmovx.f16 s26, s12 1270; CHECK-NEXT: vins.f16 s21, s13 1271; CHECK-NEXT: vins.f16 s3, s26 1272; CHECK-NEXT: vmovx.f16 s30, s10 1273; CHECK-NEXT: vmovx.f16 s23, s9 1274; CHECK-NEXT: vmov.f32 s18, s8 1275; CHECK-NEXT: vins.f16 s9, s30 1276; CHECK-NEXT: vins.f16 s23, s11 1277; CHECK-NEXT: vmov.f32 s1, s3 1278; CHECK-NEXT: vmovx.f16 s22, s14 1279; CHECK-NEXT: vmov.f32 s2, s28 1280; CHECK-NEXT: vins.f16 s22, s8 1281; CHECK-NEXT: vmov.f32 s3, s9 1282; CHECK-NEXT: vmov.f32 s7, s19 1283; CHECK-NEXT: vadd.f16 q0, q0, q5 1284; CHECK-NEXT: vadd.f16 q0, q0, q1 1285; CHECK-NEXT: vstrw.32 q0, [r1] 1286; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1287; CHECK-NEXT: bx lr 1288entry: 1289 %l1 = load <48 x half>, <48 x half>* %src, align 4 1290 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1291 %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1292 %s3 = shufflevector <48 x half> %l1, <48 x half> undef, 
<16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1293 %a1 = fadd <16 x half> %s1, %s2 1294 %a = fadd <16 x half> %a1, %s3 1295 store <16 x half> %a, <16 x half> *%dst 1296 ret void 1297} 1298 1299; f64 1300
; The f64 cases need no lane shuffles: the expected output performs the
; deinterleaved sums directly with vadd.f64 on individual d registers.
1301define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) { 1302; CHECK-LABEL: vld3_v2f64: 1303; CHECK: @ %bb.0: @ %entry 1304; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1305; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1306; CHECK-NEXT: vldrw.u32 q3, [r0] 1307; CHECK-NEXT: vadd.f64 d4, d3, d0 1308; CHECK-NEXT: vadd.f64 d5, d6, d7 1309; CHECK-NEXT: vadd.f64 d1, d4, d1 1310; CHECK-NEXT: vadd.f64 d0, d5, d2 1311; CHECK-NEXT: vstrw.32 q0, [r1] 1312; CHECK-NEXT: bx lr 1313entry: 1314 %l1 = load <6 x double>, <6 x double>* %src, align 4 1315 %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3> 1316 %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4> 1317 %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5> 1318 %a1 = fadd <2 x double> %s1, %s2 1319 %a = fadd <2 x double> %a1, %s3 1320 store <2 x double> %a, <2 x double> *%dst 1321 ret void 1322} 1323 1324define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) { 1325; CHECK-LABEL: vld3_v4f64: 1326; CHECK: @ %bb.0: @ %entry 1327; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 1328; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 1329; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 1330; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 1331; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1332; CHECK-NEXT: vldrw.u32 q4, [r0, #16] 1333; CHECK-NEXT: vadd.f64 d5, d6, d7 1334; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1335; CHECK-NEXT: vldrw.u32 q6, [r0] 1336; CHECK-NEXT: vadd.f64 d4, d1, d2 1337; CHECK-NEXT: vadd.f64 d10, d9, d6 1338; CHECK-NEXT: vadd.f64 d11, d12, d13 1339; CHECK-NEXT: vadd.f64 d3, d4, d3 1340; CHECK-NEXT: vadd.f64 d2, d5, d0 1341; CHECK-NEXT: vadd.f64 d1, d10, d7 
1342; CHECK-NEXT: vstrw.32 q1, [r1, #16] 1343; CHECK-NEXT: vadd.f64 d0, d11, d8 1344; CHECK-NEXT: vstrw.32 q0, [r1] 1345; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1346; CHECK-NEXT: bx lr 1347entry: 1348 %l1 = load <12 x double>, <12 x double>* %src, align 4 1349 %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1350 %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1351 %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1352 %a1 = fadd <4 x double> %s1, %s2 1353 %a = fadd <4 x double> %a1, %s3 1354 store <4 x double> %a, <4 x double> *%dst 1355 ret void 1356} 1357