; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
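
; Each test below hand-builds the signed saturating doubling-multiply-high
; pattern: sext both operands, multiply, ashr by (lane bits - 1), then clamp
; with smin. As a minimal scalar sketch (assuming the usual definition of a
; vqdmulh.s16 lane, sat((2 * a * b) >> 16)):
;
;   %mul = mul nsw i32 %sexta, %sextb                    ; 16x16 -> 32 product
;   %shr = ashr i32 %mul, 15                             ; (2*a*b) >> 16 == (a*b) >> 15
;   %res = call i32 @llvm.smin.i32(i32 %shr, i32 32767)  ; only the upper bound is
;                                                        ; reachable, so no smax needed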

define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}

define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v8i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i8> %s0 to <8 x i32>
  %l5 = sext <8 x i8> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <8 x i32> %l9 to <8 x i8>
  ret <8 x i8> %l10
}

define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v4i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i8> %s0 to <4 x i32>
  %l5 = sext <4 x i8> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <4 x i32> %l9 to <4 x i8>
  ret <4 x i8> %l10
}

define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v32i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s8 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <32 x i8> %s0 to <32 x i32>
  %l5 = sext <32 x i8> %s1 to <32 x i32>
  %l6 = mul nsw <32 x i32> %l5, %l2
  %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <32 x i32> %l9 to <32 x i8>
  ret <32 x i8> %l10
}

define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v4i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i16> %s0 to <4 x i32>
  %l5 = sext <4 x i16> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <4 x i32> %l9 to <4 x i16>
  ret <4 x i16> %l10
}

define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v16i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s16 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i16> %s0 to <16 x i32>
  %l5 = sext <16 x i16> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <16 x i32> %l9 to <16 x i16>
  ret <16 x i16> %l10
}
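
; The i22 variant below should not become a vqdmulh: presumably the combine
; needs an intermediate type at least twice the lane width, and i22 cannot
; hold a full 16x16 product. The vshl.i32/vshr.s32 #10 pairs in the checked
; expansion sign-extend the 22-bit result within a 32-bit lane.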

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov.u16 r0, q0[2]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q2[3]
; CHECK-NEXT:    vmov.u16 r1, q2[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    vmullb.s16 q0, q3, q0
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q3, q0, #15
; CHECK-NEXT:    vmov r0, r1, d6
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov.16 q0[1], r1
; CHECK-NEXT:    vmov r0, r1, d7
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[6]
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov.u16 r1, q2[4]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q2[7]
; CHECK-NEXT:    vmov.u16 r1, q2[5]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmullb.s16 q1, q2, q3
; CHECK-NEXT:    vshl.i32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #15
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov.16 q0[7], r1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l9 = call <8 x i22> @llvm.smin.v8i22(<8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>)
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %1 = sext <8 x i16> %0 to <8 x i32>
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %3 = sext <8 x i16> %2 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %3, %1
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %4
}
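
; Here the even and odd lanes of %s1 are deinterleaved, each multiplied
; against the truncated %s0, and reinterleaved; the expected lowering is one
; vqdmulh per half, with a vrev32.16 to line up the odd lanes and a vmovnt to
; merge the two results.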

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vqdmulh.s16 q2, q1, q0
; CHECK-NEXT:    vrev32.16 q1, q1
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q2, q0
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
  %s0 = trunc <4 x i32> %s0a to <4 x i16>
  %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %l7 = sext <4 x i16> %strided.vec to <4 x i32>
  %l8 = sext <4 x i16> %s0 to <4 x i32>
  %l9 = mul nsw <4 x i32> %l7, %l8
  %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
  %l12 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l13 = trunc <4 x i32> %l12 to <4 x i16>
  %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
  %l15 = mul nsw <4 x i32> %l14, %l8
  %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
  %l18 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l19 = trunc <4 x i32> %l18 to <4 x i16>
  %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %interleaved.vec
}

define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}

define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v2i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <2 x i32> %s0 to <2 x i64>
  %l5 = sext <2 x i32> %s1 to <2 x i64>
  %l6 = mul nsw <2 x i64> %l5, %l2
  %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
  %l9 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>)
  %l10 = trunc <2 x i64> %l9 to <2 x i32>
  ret <2 x i32> %l10
}
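
; Vectors wider than 128 bits are split during legalization, so the same
; pattern should give one vqdmulh per q register. For the v16i32 case the
; second operand no longer fits in q0-q3 and is reloaded from the stack.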

define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v8i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s32 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i32> %s0 to <8 x i64>
  %l5 = sext <8 x i32> %s1 to <8 x i64>
  %l6 = mul nsw <8 x i64> %l5, %l2
  %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <8 x i64> %l9 to <8 x i32>
  ret <8 x i32> %l10
}

define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v16i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #32
; CHECK-NEXT:    vqdmulh.s32 q0, q4, q0
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #48
; CHECK-NEXT:    vqdmulh.s32 q1, q4, q1
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #64
; CHECK-NEXT:    vqdmulh.s32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vqdmulh.s32 q3, q4, q3
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i32> %s0 to <16 x i64>
  %l5 = sext <16 x i32> %s1 to <16 x i64>
  %l6 = mul nsw <16 x i64> %l5, %l2
  %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <16 x i64> %l9 to <16 x i32>
  ret <16 x i32> %l10
}
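
; The loop versions check that the combine fires on loaded values inside a
; low-overhead loop. The trip counts are 1024 elements divided by the lane
; count: 1024/16 = 64 iterations for i8, 1024/8 = 128 for i16 and
; 1024/4 = 256 for i32, matching the mov.w lr immediates below.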

define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) local_unnamed_addr {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load26 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load26 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = ashr <16 x i32> %6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %8 = icmp slt <16 x i32> %7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load30 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load30 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = ashr <8 x i32> %6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load30 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load30 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = ashr <4 x i64> %6, <i64 31, i64 31, i64 31, i64 31>
  %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %10 = trunc <4 x i64> %9 to <4 x i32>
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
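
; A scalar version of the same clamp stays as a plain smulbb followed by a
; compare and a predicated shift; no saturating multiply is formed.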

define i32 @scalar(i16 %a) {
; CHECK-LABEL: scalar:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    smulbb r1, r0, r0
; CHECK-NEXT:    movs r0, #127
; CHECK-NEXT:    asrs r2, r1, #7
; CHECK-NEXT:    cmp r2, #127
; CHECK-NEXT:    it lt
; CHECK-NEXT:    asrlt r0, r1, #7
; CHECK-NEXT:    bx lr
  %e = sext i16 %a to i32
  %d = mul nsw i32 %e, %e
  %b = ashr i32 %d, 7
  %c = call i32 @llvm.smin.i32(i32 %b, i32 127)
  ret i32 %c
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.smin.i32(i32, i32)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
declare <8 x i22> @llvm.smin.v8i22(<8 x i22>, <8 x i22>)