; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv7a-eabi -mattr=+neon -float-abi=hard %s -o - | FileCheck %s

define <8 x i8> @vmlai8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlai8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i8 d0, d1, d2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <8 x i8> %B, %C
  %tmp5 = add <8 x i8> %A, %tmp4
  ret <8 x i8> %tmp5
}

define <4 x i16> @vmlai16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlai16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i16 d0, d1, d2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <4 x i16> %B, %C
  %tmp5 = add <4 x i16> %A, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @vmlai32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlai32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i32 d0, d1, d2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <2 x i32> %B, %C
  %tmp5 = add <2 x i32> %A, %tmp4
  ret <2 x i32> %tmp5
}

define <2 x float> @vmlaf32(<2 x float> %A, <2 x float> %B, <2 x float> %C) nounwind {
; CHECK-LABEL: vmlaf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.f32 d0, d1, d2
; CHECK-NEXT:    bx lr
  %tmp4 = fmul <2 x float> %B, %C
  %tmp5 = fadd <2 x float> %A, %tmp4
  ret <2 x float> %tmp5
}

define <16 x i8> @vmlaQi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
; CHECK-LABEL: vmlaQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i8 q0, q1, q2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <16 x i8> %B, %C
  %tmp5 = add <16 x i8> %A, %tmp4
  ret <16 x i8> %tmp5
}

define <8 x i16> @vmlaQi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) nounwind {
; CHECK-LABEL: vmlaQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i16 q0, q1, q2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <8 x i16> %B, %C
  %tmp5 = add <8 x i16> %A, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @vmlaQi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: vmlaQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i32 q0, q1, q2
; CHECK-NEXT:    bx lr
  %tmp4 = mul <4 x i32> %B, %C
  %tmp5 = add <4 x i32> %A, %tmp4
  ret <4 x i32> %tmp5
}

define <4 x float> @vmlaQf32(<4 x float> %A, <4 x float> %B, <4 x float> %C) nounwind {
; CHECK-LABEL: vmlaQf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
  %tmp4 = fmul <4 x float> %B, %C
  %tmp5 = fadd <4 x float> %A, %tmp4
  ret <4 x float> %tmp5
}

define <8 x i16> @vmlals8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlals8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s8 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = sext <8 x i8> %B to <8 x i16>
  %tmp5 = sext <8 x i8> %C to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %A, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @vmlals16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlals16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s16 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = sext <4 x i16> %B to <4 x i32>
  %tmp5 = sext <4 x i16> %C to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %A, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @vmlals32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlals32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s32 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = sext <2 x i32> %B to <2 x i64>
  %tmp5 = sext <2 x i32> %C to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %A, %tmp6
  ret <2 x i64> %tmp7
}
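
; The vmlalu* tests below mirror the vmlals* tests above but widen with zext
; instead of sext, so the unsigned form vmlal.uN should be selected. The
; vmlala* variants additionally mask the accumulated result back to the
; narrow element width; vmlal.uN should still be selected, followed by the
; mask (vbic/vand).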

define <8 x i16> @vmlalu8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlalu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u8 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = zext <8 x i8> %B to <8 x i16>
  %tmp5 = zext <8 x i8> %C to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %A, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @vmlalu16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlalu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u16 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = zext <4 x i16> %B to <4 x i32>
  %tmp5 = zext <4 x i16> %C to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %A, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @vmlalu32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlalu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u32 q0, d2, d3
; CHECK-NEXT:    bx lr
  %tmp4 = zext <2 x i32> %B to <2 x i64>
  %tmp5 = zext <2 x i32> %C to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %A, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @vmlala8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlala8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u8 q0, d2, d3
; CHECK-NEXT:    vbic.i16 q0, #0xff00
; CHECK-NEXT:    bx lr
  %tmp4 = zext <8 x i8> %B to <8 x i16>
  %tmp5 = zext <8 x i8> %C to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %A, %tmp6
  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @vmlala16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlala16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u16 q0, d2, d3
; CHECK-NEXT:    vmov.i32 q8, #0xffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
  %tmp4 = zext <4 x i16> %B to <4 x i32>
  %tmp5 = zext <4 x i16> %C to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %A, %tmp6
  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @vmlala32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlala32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u32 q0, d2, d3
; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
  %tmp4 = zext <2 x i32> %B to <2 x i64>
  %tmp5 = zext <2 x i32> %C to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %A, %tmp6
  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
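
; The *_lane* tests splat one multiplicand from a single lane via
; shufflevector; the splat should fold into the scalar-operand form of
; vmlal, e.g. vmlal.s16 q0, d2, d3[1].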

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanes16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.s16 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
  %2 = sext <4 x i16> %0 to <4 x i32>
  %3 = mul <4 x i32> %1, %2
  %4 = add <4 x i32> %arg0_int32x4_t, %3
  ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanes32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.s32 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
  %2 = sext <2 x i32> %0 to <2 x i64>
  %3 = mul <2 x i64> %1, %2
  %4 = add <2 x i64> %arg0_int64x2_t, %3
  ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_laneu16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u16 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
  %2 = zext <4 x i16> %0 to <4 x i32>
  %3 = mul <4 x i32> %1, %2
  %4 = add <4 x i32> %arg0_uint32x4_t, %3
  ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_laneu32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u32 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
  %2 = zext <2 x i32> %0 to <2 x i64>
  %3 = mul <2 x i64> %1, %2
  %4 = add <2 x i64> %arg0_uint64x2_t, %3
  ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanea16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanea16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u16 q0, d2, d3[1]
; CHECK-NEXT:    vmov.i32 q8, #0xffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
  %2 = zext <4 x i16> %0 to <4 x i32>
  %3 = mul <4 x i32> %1, %2
  %4 = add <4 x i32> %arg0_uint32x4_t, %3
  %and = and <4 x i32> %4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanea32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanea32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u32 q0, d2, d3[1]
; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
  %2 = zext <2 x i32> %0 to <2 x i64>
  %3 = mul <2 x i64> %1, %2
  %4 = add <2 x i64> %arg0_uint64x2_t, %3
  %and = and <2 x i64> %4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}