; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Tests instruction selection for the multiply-high idiom
;   trunc (shr (mul (ext x), (ext y)), bitwidth)
; on MVE. Full 128-bit vectors should select a single vmulh.[su]N; narrower
; vectors fall back to a widening vmullb plus an explicit shift/extract.

; <2 x i32> is narrower than a 128-bit MVE vector: the high halves of the
; 64-bit products are extracted lane-by-lane after vmullb.s32.
define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhs_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <2 x i32> %s0 to <2 x i64>
  %s1s = sext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = ashr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}

; Unsigned v2i32 variant: the upper result lanes are zeroed via a
; constant-pool load instead of sign-extension.
define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhu_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vldr s1, .LCPI1_0
; CHECK-NEXT:    vmov.f32 s0, s9
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s3, s1
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %s0s = zext <2 x i32> %s0 to <2 x i64>
  %s1s = zext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = lshr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}

; Full 128-bit vector: selects a single vmulh.s32.
define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhs_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}

; v4i16 is narrower than 128 bits: widening multiply plus an explicit shift.
define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhs_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s16 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}

define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u16 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}

define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhs_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}

define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}

define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhs_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}

define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}

define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhs_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}

define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}

; Loop versions (vectorizer-style output over fixed 1024-element buffers):
; the multiply-high pattern should still select vmulh inside a low-overhead
; (le/dls) loop. Note these loops use lshr even for the sign-extended
; inputs, as produced by the vectorizer for this idiom.
define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB12_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB12_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB13_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB13_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB14_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}


; Tail-predicated loop versions: vmulh should combine with the masked
; loads/stores and the active-lane mask into a dlstp/letp loop.
define void @vmulh_s32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB18_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB18_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB18_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define void @vmulh_u32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB19_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB19_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB19_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = zext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define void @vmulh_s16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB20_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB20_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB20_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define void @vmulh_u16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB21_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB21_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB21_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = zext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define void @vmulh_s8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = sext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define void @vmulh_u8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsic declarations used by the predicated loop tests above.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)