; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C) {
; CHECK-LABEL: loads_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vldrw.u32 q5, [r2]
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r4, r1, d4
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov.f32 s16, s22
; CHECK-NEXT:    vmov.f32 s18, s23
; CHECK-NEXT:    vmov r3, lr, d0
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov r0, r12, d5
; CHECK-NEXT:    vmov.f32 s8, s20
; CHECK-NEXT:    vmov.f32 s10, s21
; CHECK-NEXT:    adds r2, r5, r4
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    adcs r1, r6
; CHECK-NEXT:    asrl r2, r1, r4
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    adds r6, r1, r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    asr.w r4, r1, #31
; CHECK-NEXT:    adc.w r1, r4, lr
; CHECK-NEXT:    asrl r6, r1, r3
; CHECK-NEXT:    vmov r5, r4, d1
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    asr.w r3, r1, #31
; CHECK-NEXT:    adc.w r1, r3, r12
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    asrl r0, r1, r3
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    adds r6, r1, r5
; CHECK-NEXT:    asr.w r2, r1, #31
; CHECK-NEXT:    adc.w r1, r2, r4
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %c = load <4 x i32>, <4 x i32> *%C, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %sc = zext <4 x i32> %c to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %sc
  %t = trunc <4 x i64> %sh to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @loads_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C) {
; CHECK-LABEL: loads_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmovlb.s16 q1, q0
; CHECK-NEXT:    vmovlb.s16 q3, q2
; CHECK-NEXT:    vmovlt.s16 q0, q0
; CHECK-NEXT:    vmovlt.s16 q2, q2
; CHECK-NEXT:    vadd.i32 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r2]
; CHECK-NEXT:    vadd.i32 q1, q3, q1
; CHECK-NEXT:    vmovlt.u16 q3, q2
; CHECK-NEXT:    vneg.s32 q3, q3
; CHECK-NEXT:    vshl.s32 q3, q0, q3
; CHECK-NEXT:    vmovlb.u16 q0, q2
; CHECK-NEXT:    vneg.s32 q0, q0
; CHECK-NEXT:    vshl.s32 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q0, q3
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %c = load <8 x i16>, <8 x i16> *%C, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %sc = zext <8 x i16> %c to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %sc
  %t = trunc <8 x i32> %sh to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @loads_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C) {
; CHECK-LABEL: loads_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmovlb.s8 q1, q0
; CHECK-NEXT:    vmovlb.s8 q3, q2
; CHECK-NEXT:    vmovlt.s8 q0, q0
; CHECK-NEXT:    vmovlt.s8 q2, q2
; CHECK-NEXT:    vadd.i16 q0, q2, q0
; CHECK-NEXT:    vldrw.u32 q2, [r2]
; CHECK-NEXT:    vadd.i16 q1, q3, q1
; CHECK-NEXT:    vmovlt.u8 q3, q2
; CHECK-NEXT:    vneg.s16 q3, q3
; CHECK-NEXT:    vshl.s16 q3, q0, q3
; CHECK-NEXT:    vmovlb.u8 q0, q2
; CHECK-NEXT:    vneg.s16 q0, q0
; CHECK-NEXT:    vshl.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i16 q0, q3
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %c = load <16 x i8>, <16 x i8> *%C, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %sc = zext <16 x i8> %c to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %sc
  %t = trunc <16 x i16> %sh to <16 x i8>
  ret <16 x i8> %t
}

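; The same extend/add/ashr/trunc patterns as above, but storing the truncated
; result to memory instead of returning it.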
define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
; CHECK-LABEL: load_store_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vldrw.u32 q5, [r2]
; CHECK-NEXT:    vmov.f32 s4, s10
; CHECK-NEXT:    vmov.f32 s6, s11
; CHECK-NEXT:    vmov.f32 s10, s9
; CHECK-NEXT:    vand q1, q1, q0
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r5, r1, d2
; CHECK-NEXT:    vmov.f32 s12, s2
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vmov r6, s12
; CHECK-NEXT:    vmov.f32 s16, s22
; CHECK-NEXT:    vmov.f32 s18, s23
; CHECK-NEXT:    vmov r4, lr, d4
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r0, r12, d3
; CHECK-NEXT:    vmov.f32 s4, s20
; CHECK-NEXT:    vmov.f32 s6, s21
; CHECK-NEXT:    adds r2, r6, r5
; CHECK-NEXT:    vmov r5, s16
; CHECK-NEXT:    asr.w r7, r6, #31
; CHECK-NEXT:    adcs r1, r7
; CHECK-NEXT:    asrl r2, r1, r5
; CHECK-NEXT:    vmov r7, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    adds r4, r4, r1
; CHECK-NEXT:    asr.w r5, r1, #31
; CHECK-NEXT:    adc.w r1, r5, lr
; CHECK-NEXT:    asrl r4, r1, r7
; CHECK-NEXT:    vmov r6, r5, d5
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    asr.w r7, r1, #31
; CHECK-NEXT:    adc.w r1, r7, r12
; CHECK-NEXT:    vmov r7, s18
; CHECK-NEXT:    asrl r0, r1, r7
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    adds r6, r6, r1
; CHECK-NEXT:    asr.w r2, r1, #31
; CHECK-NEXT:    adc.w r1, r2, r5
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    vmov q2[3], q2[1], r6, r0
; CHECK-NEXT:    vstrw.32 q2, [r3]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %c = load <4 x i32>, <4 x i32> *%C, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %sc = zext <4 x i32> %c to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %sc
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

define arm_aapcs_vfpcc void @load_store_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C, <8 x i16> *%D) {
; CHECK-LABEL: load_store_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
; CHECK-NEXT:    vldrh.s32 q2, [r0]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vldrh.u32 q1, [r2, #8]
; CHECK-NEXT:    vneg.s32 q1, q1
; CHECK-NEXT:    vshl.s32 q0, q0, q1
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrh.u32 q2, [r2]
; CHECK-NEXT:    vstrh.32 q0, [r3, #8]
; CHECK-NEXT:    vneg.s32 q2, q2
; CHECK-NEXT:    vshl.s32 q1, q1, q2
; CHECK-NEXT:    vstrh.32 q1, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %c = load <8 x i16>, <8 x i16> *%C, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %sc = zext <8 x i16> %c to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %sc
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

define arm_aapcs_vfpcc void @load_store_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C, <16 x i8> *%D) {
; CHECK-LABEL: load_store_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r0, #8]
; CHECK-NEXT:    vldrb.s16 q2, [r0]
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vldrb.u16 q1, [r2, #8]
; CHECK-NEXT:    vneg.s16 q1, q1
; CHECK-NEXT:    vshl.s16 q0, q0, q1
; CHECK-NEXT:    vldrb.s16 q1, [r1]
; CHECK-NEXT:    vadd.i16 q1, q2, q1
; CHECK-NEXT:    vldrb.u16 q2, [r2]
; CHECK-NEXT:    vstrb.16 q0, [r3, #8]
; CHECK-NEXT:    vneg.s16 q2, q2
; CHECK-NEXT:    vshl.s16 q1, q1, q2
; CHECK-NEXT:    vstrb.16 q1, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %c = load <16 x i8>, <16 x i8> *%C, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %sc = zext <16 x i8> %c to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %sc
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}


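; A single loaded vector is sign extended and used as both the addends and the
; shift amount.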
define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s6, s3
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    adds.w r12, r2, r2
; CHECK-NEXT:    asr.w r3, r2, #31
; CHECK-NEXT:    adc.w r7, r3, r2, asr #31
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    asrl r12, r7, r2
; CHECK-NEXT:    adds r0, r3, r3
; CHECK-NEXT:    asr.w r5, r3, #31
; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
; CHECK-NEXT:    asrl r0, r5, r3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adds r4, r3, r3
; CHECK-NEXT:    asr.w r5, r3, #31
; CHECK-NEXT:    adc.w r5, r5, r3, asr #31
; CHECK-NEXT:    asrl r4, r5, r3
; CHECK-NEXT:    vmov q1[2], q1[0], r4, r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    adds r4, r0, r0
; CHECK-NEXT:    asr.w r2, r0, #31
; CHECK-NEXT:    adc.w r3, r2, r0, asr #31
; CHECK-NEXT:    asrl r4, r3, r0
; CHECK-NEXT:    vmov q1[3], q1[1], r4, r12
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %sa = sext <4 x i32> %a to <4 x i64>
  %add = add <4 x i64> %sa, %sa
  %sh = ashr <4 x i64> %add, %sa
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

define arm_aapcs_vfpcc void @load_one_store_i16(<8 x i16> *%A, <8 x i16> *%D) {
; CHECK-LABEL: load_one_store_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vneg.s32 q1, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q0
; CHECK-NEXT:    vshl.s32 q0, q0, q1
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vneg.s32 q2, q1
; CHECK-NEXT:    vadd.i32 q1, q1, q1
; CHECK-NEXT:    vshl.s32 q1, q1, q2
; CHECK-NEXT:    vstrh.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %sa = sext <8 x i16> %a to <8 x i32>
  %add = add <8 x i32> %sa, %sa
  %sh = ashr <8 x i32> %add, %sa
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

define arm_aapcs_vfpcc void @load_one_store_i8(<16 x i8> *%A, <16 x i8> *%D) {
; CHECK-LABEL: load_one_store_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r0, #8]
; CHECK-NEXT:    vneg.s16 q1, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q0
; CHECK-NEXT:    vshl.s16 q0, q0, q1
; CHECK-NEXT:    vldrb.s16 q1, [r0]
; CHECK-NEXT:    vstrb.16 q0, [r1, #8]
; CHECK-NEXT:    vneg.s16 q2, q1
; CHECK-NEXT:    vadd.i16 q1, q1, q1
; CHECK-NEXT:    vshl.s16 q1, q1, q2
; CHECK-NEXT:    vstrb.16 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %sa = sext <16 x i8> %a to <16 x i16>
  %add = add <16 x i16> %sa, %sa
  %sh = ashr <16 x i16> %add, %sa
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}


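; As above, but multiplying the sign-extended values and shifting by a scalar
; amount splatted across all lanes.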
define arm_aapcs_vfpcc void @mul_i32(<4 x i32> *%A, <4 x i32> *%B, i64 %C, <4 x i32> *%D) {
; CHECK-LABEL: mul_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr.w lr, [sp, #20]
; CHECK-NEXT:    vmov.f32 s8, s0
; CHECK-NEXT:    vmov.f32 s12, s4
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    smull r12, r3, r1, r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s10, s3
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    asrl r12, r3, r2
; CHECK-NEXT:    vmov.f32 s2, s7
; CHECK-NEXT:    vmullb.s32 q1, q0, q2
; CHECK-NEXT:    vmov r6, r1, d2
; CHECK-NEXT:    vmov r4, r7, d3
; CHECK-NEXT:    asrl r6, r1, r2
; CHECK-NEXT:    asrl r4, r7, r2
; CHECK-NEXT:    smull r0, r5, r5, r0
; CHECK-NEXT:    asrl r0, r5, r2
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r6
; CHECK-NEXT:    vmov q0[3], q0[1], r12, r4
; CHECK-NEXT:    vstrw.32 q0, [lr]
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %a = load <4 x i32>, <4 x i32> *%A, align 4
  %b = load <4 x i32>, <4 x i32> *%B, align 4
  %i = insertelement <4 x i64> undef, i64 %C, i32 0
  %c = shufflevector <4 x i64> %i, <4 x i64> undef, <4 x i32> zeroinitializer
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = sext <4 x i32> %b to <4 x i64>
  %add = mul <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, %c
  %t = trunc <4 x i64> %sh to <4 x i32>
  store <4 x i32> %t, <4 x i32> *%D, align 4
  ret void
}

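; For the narrower element types the multiply can use vmullb/vmullt, with the
; arithmetic right shift performed as a vshl by the negated scalar amount.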
define arm_aapcs_vfpcc void @mul_i16(<8 x i16> *%A, <8 x i16> *%B, i32 %C, <8 x i16> *%D) {
; CHECK-LABEL: mul_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    vmullt.s16 q2, q1, q0
; CHECK-NEXT:    vmullb.s16 q0, q1, q0
; CHECK-NEXT:    vshl.s32 q2, r2
; CHECK-NEXT:    vshl.s32 q0, r2
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <8 x i16>, <8 x i16> *%A, align 4
  %b = load <8 x i16>, <8 x i16> *%B, align 4
  %i = insertelement <8 x i32> undef, i32 %C, i32 0
  %c = shufflevector <8 x i32> %i, <8 x i32> undef, <8 x i32> zeroinitializer
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %add = mul <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, %c
  %t = trunc <8 x i32> %sh to <8 x i16>
  store <8 x i16> %t, <8 x i16> *%D, align 4
  ret void
}

define arm_aapcs_vfpcc void @mul_i8(<16 x i8> *%A, <16 x i8> *%B, i16 %C, <16 x i8> *%D) {
; CHECK-LABEL: mul_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    vmullt.s8 q2, q1, q0
; CHECK-NEXT:    vmullb.s8 q0, q1, q0
; CHECK-NEXT:    vshl.s16 q2, r2
; CHECK-NEXT:    vshl.s16 q0, r2
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    bx lr
entry:
  %a = load <16 x i8>, <16 x i8> *%A, align 4
  %b = load <16 x i8>, <16 x i8> *%B, align 4
  %i = insertelement <16 x i16> undef, i16 %C, i32 0
  %c = shufflevector <16 x i16> %i, <16 x i16> undef, <16 x i32> zeroinitializer
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = sext <16 x i8> %b to <16 x i16>
  %add = mul <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, %c
  %t = trunc <16 x i16> %sh to <16 x i8>
  store <16 x i8> %t, <16 x i8> *%D, align 4
  ret void
}