; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Codegen tests for the MVE vector-by-scalar multiply-accumulate forms.
;
; Every test builds a splat of a scalar argument (insertelement into lane 0
; followed by a zero-mask shufflevector) and combines it with a vector mul/add:
;   * vmla*  : acc + (vec * splat(scalar))   -> expected to select "vmla"
;   * vmlas* : (vecA * vecB) + splat(scalar) -> expected to select "vmlas"
; The "b" variants commute the operands of the mul and/or add, and the
; "*_in_loop" variants keep the splat in the entry block while the mul/add
; live in the loop body.

; --- vmla, straight-line -----------------------------------------------------

; %A + (%B * splat(%X)) on <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @vmlau32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %B, %1
  %3 = add nsw <4 x i32> %A, %2
  ret <4 x i32> %3
}

; Same as vmlau32 with the mul and add operands commuted.
define arm_aapcs_vfpcc <4 x i32> @vmlau32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %1, %B
  %3 = add nsw <4 x i32> %2, %A
  ret <4 x i32> %3
}

; %A + (%B * splat(%X)) on <8 x i16>.
define arm_aapcs_vfpcc <8 x i16> @vmlau16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %B, %1
  %3 = add nsw <8 x i16> %A, %2
  ret <8 x i16> %3
}

; Same as vmlau16 with the mul and add operands commuted.
define arm_aapcs_vfpcc <8 x i16> @vmlau16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %1, %B
  %3 = add nsw <8 x i16> %2, %A
  ret <8 x i16> %3
}

; %A + (%B * splat(%X)) on <16 x i8>.
define arm_aapcs_vfpcc <16 x i8> @vmlau8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %B, %1
  %3 = add nsw <16 x i8> %A, %2
  ret <16 x i8> %3
}

; Same as vmlau8 with the mul and add operands commuted.
define arm_aapcs_vfpcc <16 x i8> @vmlau8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %1, %B
  %3 = add nsw <16 x i8> %2, %A
  ret <16 x i8> %3
}

; --- vmla, splat defined outside the loop ------------------------------------

; Each iteration computes d[i..i+3] += s1[i..i+3] * splat(%x). The splat is
; built in %entry; the expected loop body still uses the scalar form of vmla
; with the scalar in r1.
define void @vmla32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) {
; CHECK-LABEL: vmla32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmla.u32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
  %3 = getelementptr inbounds i32, i32* %d, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = add nsw <4 x i32> %wide.load10, %2
  %6 = bitcast i32* %3 to <4 x i32>*
  store <4 x i32> %5, <4 x i32>* %6, align 4
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; 16-bit version of vmla32_in_loop: eight i16 lanes per iteration, with the
; add operands in the opposite order (product first).
define void @vmla16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) {
; CHECK-LABEL: vmla16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmla.u16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %s1, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = mul <8 x i16> %wide.load, %broadcast.splat12
  %3 = getelementptr inbounds i16, i16* %d, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load13 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = add <8 x i16> %2, %wide.load13
  %6 = bitcast i16* %3 to <8 x i16>*
  store <8 x i16> %5, <8 x i16>* %6, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; 8-bit version of vmla16_in_loop: sixteen i8 lanes per iteration.
; NOTE(review): the <16 x i8> loads/stores below use "align 2" and the
; expected loads are vldrh.u16; this looks carried over from the 16-bit test.
; Confirm whether "align 1" was intended before regenerating the assertions.
define void @vmla8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) {
; CHECK-LABEL: vmla8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmla.u8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %s1, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 2
  %2 = mul <16 x i8> %wide.load, %broadcast.splat12
  %3 = getelementptr inbounds i8, i8* %d, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load13 = load <16 x i8>, <16 x i8>* %4, align 2
  %5 = add <16 x i8> %2, %wide.load13
  %6 = bitcast i8* %3 to <16 x i8>*
  store <16 x i8> %5, <16 x i8>* %6, align 2
  %index.next = add i32 %index, 16
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; --- vmlas, straight-line ----------------------------------------------------

; splat(%X) + (%A * %B) on <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %1, %2
  ret <4 x i32> %3
}

; Same as vmlasu32 with the add operands commuted.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %2, %1
  ret <4 x i32> %3
}

; splat(%X) + (%A * %B) on <8 x i16>.
define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %1, %2
  ret <8 x i16> %3
}

; Same as vmlasu16 with the add operands commuted.
define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %2, %1
  ret <8 x i16> %3
}

; splat(%X) + (%A * %B) on <16 x i8>.
define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %1, %2
  ret <16 x i8> %3
}

; Same as vmlasu8 with the add operands commuted.
define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %2, %1
  ret <16 x i8> %3
}

; --- vmlas, splat defined outside the loop -----------------------------------

; Each iteration computes d[i..i+3] = s1[i..i+3] * d[i..i+3] + splat(%x). The
; splat is built in %entry; the expected loop body uses vmlas with the scalar
; in r1.
define void @vmlas32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) {
; CHECK-LABEL: vmlas32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmlas.u32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = getelementptr inbounds i32, i32* %d, i32 %index
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.load10 = load <4 x i32>, <4 x i32>* %3, align 4
  %4 = mul nsw <4 x i32> %wide.load, %wide.load10
  %5 = add nsw <4 x i32> %broadcast.splat9, %4
  %6 = bitcast i32* %2 to <4 x i32>*
  store <4 x i32> %5, <4 x i32>* %6, align 4
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; 16-bit version of vmlas32_in_loop: eight i16 lanes per iteration, with the
; add operands in the opposite order (product first).
define void @vmlas16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) {
; CHECK-LABEL: vmlas16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmlas.u16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %s1, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = getelementptr inbounds i16, i16* %d, i32 %index
  %3 = bitcast i16* %2 to <8 x i16>*
  %wide.load13 = load <8 x i16>, <8 x i16>* %3, align 2
  %4 = mul <8 x i16> %wide.load, %wide.load13
  %5 = add <8 x i16> %4, %broadcast.splat12
  %6 = bitcast i16* %2 to <8 x i16>*
  store <8 x i16> %5, <8 x i16>* %6, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; 8-bit version of vmlas16_in_loop: sixteen i8 lanes per iteration.
; NOTE(review): the <16 x i8> loads/stores below use "align 2" and the
; expected loads are vldrh.u16; this looks carried over from the 16-bit test.
; Confirm whether "align 1" was intended before regenerating the assertions.
define void @vmlas8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) {
; CHECK-LABEL: vmlas8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmlas.u8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %s1, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 2
  %2 = getelementptr inbounds i8, i8* %d, i32 %index
  %3 = bitcast i8* %2 to <16 x i8>*
  %wide.load13 = load <16 x i8>, <16 x i8>* %3, align 2
  %4 = mul <16 x i8> %wide.load, %wide.load13
  %5 = add <16 x i8> %4, %broadcast.splat12
  %6 = bitcast i8* %2 to <16 x i8>*
  store <16 x i8> %5, <16 x i8>* %6, align 2
  %index.next = add i32 %index, 16
  %7 = icmp eq i32 %index.next, %n
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}