; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.

define i32 @addv2i32i32(i32* %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %x, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %add.1 = add nsw i32 %1, %0
  ret i32 %add.1
}

define i32 @addv4i32i32(i32* %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast i32* %x to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i32(i32* %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast i32* %x to <8 x i32>*
  %1 = load <8 x i32>, <8 x i32>* %0, align 4
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i32(i32* %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast i32* %x to <16 x i32>*
  %1 = load <16 x i32>, <16 x i32>* %0, align 4
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i32(i32* %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast i32* %x to <8 x i32>*
  %1 = load <8 x i32>, <8 x i32>* %0, align 4
  %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8
  %2 = bitcast i32* %arrayidx.8 to <16 x i32>*
  %3 = load <16 x i32>, <16 x i32>* %2, align 4
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  %op.rdx = add nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i32(i32* %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1,
#48] 109; CHECK-NEXT: vaddva.u32 r0, q0 110; CHECK-NEXT: vldrw.u32 q0, [r1, #64] 111; CHECK-NEXT: vaddva.u32 r0, q0 112; CHECK-NEXT: vldrw.u32 q0, [r1, #80] 113; CHECK-NEXT: vaddva.u32 r0, q0 114; CHECK-NEXT: vldrw.u32 q0, [r1, #96] 115; CHECK-NEXT: vaddva.u32 r0, q0 116; CHECK-NEXT: vldrw.u32 q0, [r1, #112] 117; CHECK-NEXT: vaddva.u32 r0, q0 118; CHECK-NEXT: bx lr 119entry: 120 %0 = bitcast i32* %x to <32 x i32>* 121 %1 = load <32 x i32>, <32 x i32>* %0, align 4 122 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) 123 ret i32 %2 124} 125 126define i32 @addv64i32i32(i32* %x) { 127; CHECK-LABEL: addv64i32i32: 128; CHECK: @ %bb.0: @ %entry 129; CHECK-NEXT: vldrw.u32 q1, [r0] 130; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 131; CHECK-NEXT: vaddv.u32 r2, q1 132; CHECK-NEXT: vaddva.u32 r2, q0 133; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 134; CHECK-NEXT: vaddva.u32 r2, q0 135; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 136; CHECK-NEXT: vaddva.u32 r2, q0 137; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 138; CHECK-NEXT: vaddva.u32 r2, q0 139; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 140; CHECK-NEXT: vaddva.u32 r2, q0 141; CHECK-NEXT: vldrw.u32 q0, [r0, #96] 142; CHECK-NEXT: vaddva.u32 r2, q0 143; CHECK-NEXT: vldrw.u32 q0, [r0, #112] 144; CHECK-NEXT: vaddva.u32 r2, q0 145; CHECK-NEXT: vldrw.u32 q0, [r0, #128] 146; CHECK-NEXT: vaddva.u32 r2, q0 147; CHECK-NEXT: vldrw.u32 q0, [r0, #144] 148; CHECK-NEXT: vaddva.u32 r2, q0 149; CHECK-NEXT: vldrw.u32 q0, [r0, #160] 150; CHECK-NEXT: vaddva.u32 r2, q0 151; CHECK-NEXT: vldrw.u32 q0, [r0, #176] 152; CHECK-NEXT: vaddva.u32 r2, q0 153; CHECK-NEXT: vldrw.u32 q0, [r0, #192] 154; CHECK-NEXT: vaddva.u32 r2, q0 155; CHECK-NEXT: vldrw.u32 q0, [r0, #208] 156; CHECK-NEXT: vaddva.u32 r2, q0 157; CHECK-NEXT: vldrw.u32 q0, [r0, #224] 158; CHECK-NEXT: vaddva.u32 r2, q0 159; CHECK-NEXT: vldrw.u32 q0, [r0, #240] 160; CHECK-NEXT: vaddva.u32 r2, q0 161; CHECK-NEXT: mov r0, r2 162; CHECK-NEXT: bx lr 163entry: 164 %0 = bitcast i32* %x to <64 x i32>* 165 %1 = load <64 x i32>, <64 x i32>* %0, align 4 166 %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1) 167 ret i32 %2 168} 169 170define i32 @addv128i32i32(i32* %x) { 171; CHECK-LABEL: addv128i32i32: 172; CHECK: @ %bb.0: @ %entry 173; CHECK-NEXT: vldrw.u32 q1, [r0] 174; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 175; CHECK-NEXT: vaddv.u32 r2, q1 176; CHECK-NEXT: vaddva.u32 r2, q0 177; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 178; CHECK-NEXT: vaddva.u32 r2, q0 179; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 180; CHECK-NEXT: vaddva.u32 r2, q0 181; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 182; CHECK-NEXT: vaddva.u32 r2, q0 183; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 184; CHECK-NEXT: vaddva.u32 r2, q0 185; CHECK-NEXT: vldrw.u32 q0, [r0, #96] 186; CHECK-NEXT: vaddva.u32 r2, q0 187; CHECK-NEXT: vldrw.u32 q0, [r0, #112] 188; CHECK-NEXT: vaddva.u32 r2, q0 189; CHECK-NEXT: vldrw.u32 q0, [r0, #128] 190; CHECK-NEXT: vaddva.u32 r2, q0 191; CHECK-NEXT: vldrw.u32 q0, [r0, #144] 192; CHECK-NEXT: vaddva.u32 r2, q0 193; CHECK-NEXT: vldrw.u32 q0, [r0, #160] 194; CHECK-NEXT: vaddva.u32 r2, q0 195; CHECK-NEXT: vldrw.u32 q0, [r0, #176] 196; CHECK-NEXT: vaddva.u32 r2, q0 197; CHECK-NEXT: vldrw.u32 q0, [r0, #192] 198; CHECK-NEXT: vaddva.u32 r2, q0 199; CHECK-NEXT: vldrw.u32 q0, [r0, #208] 200; CHECK-NEXT: vaddva.u32 r2, q0 201; CHECK-NEXT: vldrw.u32 q0, [r0, #224] 202; CHECK-NEXT: vaddva.u32 r2, q0 203; CHECK-NEXT: vldrw.u32 q0, [r0, #240] 204; CHECK-NEXT: vaddva.u32 r2, q0 205; CHECK-NEXT: vldrw.u32 q0, [r0, #256] 206; CHECK-NEXT: vaddva.u32 r2, q0 207; CHECK-NEXT: vldrw.u32 q0, [r0, 
#272] 208; CHECK-NEXT: vaddva.u32 r2, q0 209; CHECK-NEXT: vldrw.u32 q0, [r0, #288] 210; CHECK-NEXT: vaddva.u32 r2, q0 211; CHECK-NEXT: vldrw.u32 q0, [r0, #304] 212; CHECK-NEXT: vaddva.u32 r2, q0 213; CHECK-NEXT: vldrw.u32 q0, [r0, #320] 214; CHECK-NEXT: vaddva.u32 r2, q0 215; CHECK-NEXT: vldrw.u32 q0, [r0, #336] 216; CHECK-NEXT: vaddva.u32 r2, q0 217; CHECK-NEXT: vldrw.u32 q0, [r0, #352] 218; CHECK-NEXT: vaddva.u32 r2, q0 219; CHECK-NEXT: vldrw.u32 q0, [r0, #368] 220; CHECK-NEXT: vaddva.u32 r2, q0 221; CHECK-NEXT: vldrw.u32 q0, [r0, #384] 222; CHECK-NEXT: vaddva.u32 r2, q0 223; CHECK-NEXT: vldrw.u32 q0, [r0, #400] 224; CHECK-NEXT: vaddva.u32 r2, q0 225; CHECK-NEXT: vldrw.u32 q0, [r0, #416] 226; CHECK-NEXT: vaddva.u32 r2, q0 227; CHECK-NEXT: vldrw.u32 q0, [r0, #432] 228; CHECK-NEXT: vaddva.u32 r2, q0 229; CHECK-NEXT: vldrw.u32 q0, [r0, #448] 230; CHECK-NEXT: vaddva.u32 r2, q0 231; CHECK-NEXT: vldrw.u32 q0, [r0, #464] 232; CHECK-NEXT: vaddva.u32 r2, q0 233; CHECK-NEXT: vldrw.u32 q0, [r0, #480] 234; CHECK-NEXT: vaddva.u32 r2, q0 235; CHECK-NEXT: vldrw.u32 q0, [r0, #496] 236; CHECK-NEXT: vaddva.u32 r2, q0 237; CHECK-NEXT: mov r0, r2 238; CHECK-NEXT: bx lr 239entry: 240 %0 = bitcast i32* %x to <4 x i32>* 241 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4 242 %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load) 243 %2 = getelementptr inbounds i32, i32* %x, i32 4 244 %3 = bitcast i32* %2 to <4 x i32>* 245 %wide.load.1 = load <4 x i32>, <4 x i32>* %3, align 4 246 %4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1) 247 %5 = add i32 %4, %1 248 %6 = getelementptr inbounds i32, i32* %x, i32 8 249 %7 = bitcast i32* %6 to <4 x i32>* 250 %wide.load.2 = load <4 x i32>, <4 x i32>* %7, align 4 251 %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2) 252 %9 = add i32 %8, %5 253 %10 = getelementptr inbounds i32, i32* %x, i32 12 254 %11 = bitcast i32* %10 to <4 x i32>* 255 %wide.load.3 = load <4 x i32>, <4 x i32>* %11, align 4 256 %12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3) 257 %13 = add i32 %12, %9 258 %14 = getelementptr inbounds i32, i32* %x, i32 16 259 %15 = bitcast i32* %14 to <4 x i32>* 260 %wide.load.4 = load <4 x i32>, <4 x i32>* %15, align 4 261 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4) 262 %17 = add i32 %16, %13 263 %18 = getelementptr inbounds i32, i32* %x, i32 20 264 %19 = bitcast i32* %18 to <4 x i32>* 265 %wide.load.5 = load <4 x i32>, <4 x i32>* %19, align 4 266 %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5) 267 %21 = add i32 %20, %17 268 %22 = getelementptr inbounds i32, i32* %x, i32 24 269 %23 = bitcast i32* %22 to <4 x i32>* 270 %wide.load.6 = load <4 x i32>, <4 x i32>* %23, align 4 271 %24 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6) 272 %25 = add i32 %24, %21 273 %26 = getelementptr inbounds i32, i32* %x, i32 28 274 %27 = bitcast i32* %26 to <4 x i32>* 275 %wide.load.7 = load <4 x i32>, <4 x i32>* %27, align 4 276 %28 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7) 277 %29 = add i32 %28, %25 278 %30 = getelementptr inbounds i32, i32* %x, i32 32 279 %31 = bitcast i32* %30 to <4 x i32>* 280 %wide.load.8 = load <4 x i32>, <4 x i32>* %31, align 4 281 %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8) 282 %33 = add i32 %32, %29 283 %34 = getelementptr inbounds i32, i32* %x, i32 36 284 %35 = bitcast i32* %34 to <4 x i32>* 285 %wide.load.9 = load <4 x i32>, <4 x i32>* %35, align 4 286 %36 = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9) 287 %37 = add i32 %36, %33 288 %38 = getelementptr inbounds i32, i32* %x, i32 40 289 %39 = bitcast i32* %38 to <4 x i32>* 290 %wide.load.10 = load <4 x i32>, <4 x i32>* %39, align 4 291 %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10) 292 %41 = add i32 %40, %37 293 %42 = getelementptr inbounds i32, i32* %x, i32 44 294 %43 = bitcast i32* %42 to <4 x i32>* 295 %wide.load.11 = load <4 x i32>, <4 x i32>* %43, align 4 296 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11) 297 %45 = add i32 %44, %41 298 %46 = getelementptr inbounds i32, i32* %x, i32 48 299 %47 = bitcast i32* %46 to <4 x i32>* 300 %wide.load.12 = load <4 x i32>, <4 x i32>* %47, align 4 301 %48 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12) 302 %49 = add i32 %48, %45 303 %50 = getelementptr inbounds i32, i32* %x, i32 52 304 %51 = bitcast i32* %50 to <4 x i32>* 305 %wide.load.13 = load <4 x i32>, <4 x i32>* %51, align 4 306 %52 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13) 307 %53 = add i32 %52, %49 308 %54 = getelementptr inbounds i32, i32* %x, i32 56 309 %55 = bitcast i32* %54 to <4 x i32>* 310 %wide.load.14 = load <4 x i32>, <4 x i32>* %55, align 4 311 %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14) 312 %57 = add i32 %56, %53 313 %58 = getelementptr inbounds i32, i32* %x, i32 60 314 %59 = bitcast i32* %58 to <4 x i32>* 315 %wide.load.15 = load <4 x i32>, <4 x i32>* %59, align 4 316 %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15) 317 %61 = add i32 %60, %57 318 %62 = getelementptr inbounds i32, i32* %x, i32 64 319 %63 = bitcast i32* %62 to <4 x i32>* 320 %wide.load.16 = load <4 x i32>, <4 x i32>* %63, align 4 321 %64 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16) 322 %65 = add i32 %64, %61 323 %66 = getelementptr inbounds i32, i32* %x, i32 68 324 %67 = bitcast i32* %66 to <4 x i32>* 325 %wide.load.17 = load <4 x i32>, <4 x i32>* %67, align 4 326 %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17) 327 %69 = add i32 %68, %65 328 %70 = getelementptr inbounds i32, i32* %x, i32 72 329 %71 = bitcast i32* %70 to <4 x i32>* 330 %wide.load.18 = load <4 x i32>, <4 x i32>* %71, align 4 331 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18) 332 %73 = add i32 %72, %69 333 %74 = getelementptr inbounds i32, i32* %x, i32 76 334 %75 = bitcast i32* %74 to <4 x i32>* 335 %wide.load.19 = load <4 x i32>, <4 x i32>* %75, align 4 336 %76 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19) 337 %77 = add i32 %76, %73 338 %78 = getelementptr inbounds i32, i32* %x, i32 80 339 %79 = bitcast i32* %78 to <4 x i32>* 340 %wide.load.20 = load <4 x i32>, <4 x i32>* %79, align 4 341 %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20) 342 %81 = add i32 %80, %77 343 %82 = getelementptr inbounds i32, i32* %x, i32 84 344 %83 = bitcast i32* %82 to <4 x i32>* 345 %wide.load.21 = load <4 x i32>, <4 x i32>* %83, align 4 346 %84 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21) 347 %85 = add i32 %84, %81 348 %86 = getelementptr inbounds i32, i32* %x, i32 88 349 %87 = bitcast i32* %86 to <4 x i32>* 350 %wide.load.22 = load <4 x i32>, <4 x i32>* %87, align 4 351 %88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22) 352 %89 = add i32 %88, %85 353 %90 = getelementptr inbounds i32, i32* %x, i32 92 354 %91 = bitcast i32* %90 to <4 x i32>* 355 %wide.load.23 = load <4 x i32>, <4 x i32>* 
%91, align 4 356 %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23) 357 %93 = add i32 %92, %89 358 %94 = getelementptr inbounds i32, i32* %x, i32 96 359 %95 = bitcast i32* %94 to <4 x i32>* 360 %wide.load.24 = load <4 x i32>, <4 x i32>* %95, align 4 361 %96 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24) 362 %97 = add i32 %96, %93 363 %98 = getelementptr inbounds i32, i32* %x, i32 100 364 %99 = bitcast i32* %98 to <4 x i32>* 365 %wide.load.25 = load <4 x i32>, <4 x i32>* %99, align 4 366 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25) 367 %101 = add i32 %100, %97 368 %102 = getelementptr inbounds i32, i32* %x, i32 104 369 %103 = bitcast i32* %102 to <4 x i32>* 370 %wide.load.26 = load <4 x i32>, <4 x i32>* %103, align 4 371 %104 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26) 372 %105 = add i32 %104, %101 373 %106 = getelementptr inbounds i32, i32* %x, i32 108 374 %107 = bitcast i32* %106 to <4 x i32>* 375 %wide.load.27 = load <4 x i32>, <4 x i32>* %107, align 4 376 %108 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27) 377 %109 = add i32 %108, %105 378 %110 = getelementptr inbounds i32, i32* %x, i32 112 379 %111 = bitcast i32* %110 to <4 x i32>* 380 %wide.load.28 = load <4 x i32>, <4 x i32>* %111, align 4 381 %112 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28) 382 %113 = add i32 %112, %109 383 %114 = getelementptr inbounds i32, i32* %x, i32 116 384 %115 = bitcast i32* %114 to <4 x i32>* 385 %wide.load.29 = load <4 x i32>, <4 x i32>* %115, align 4 386 %116 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29) 387 %117 = add i32 %116, %113 388 %118 = getelementptr inbounds i32, i32* %x, i32 120 389 %119 = bitcast i32* %118 to <4 x i32>* 390 %wide.load.30 = load <4 x i32>, <4 x i32>* %119, align 4 391 %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30) 392 %121 = add i32 %120, %117 393 %122 = getelementptr inbounds i32, i32* %x, i32 124 394 %123 = bitcast i32* %122 to <4 x i32>* 395 %wide.load.31 = load <4 x i32>, <4 x i32>* %123, align 4 396 %124 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31) 397 %125 = add i32 %124, %121 398 ret i32 %125 399} 400 401define i32 @addv2i32i16(i16* %x) { 402; CHECK-LABEL: addv2i32i16: 403; CHECK: @ %bb.0: @ %entry 404; CHECK-NEXT: ldrsh.w r1, [r0] 405; CHECK-NEXT: ldrsh.w r0, [r0, #2] 406; CHECK-NEXT: add r0, r1 407; CHECK-NEXT: bx lr 408entry: 409 %0 = load i16, i16* %x, align 2 410 %conv = sext i16 %0 to i32 411 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1 412 %1 = load i16, i16* %arrayidx.1, align 2 413 %conv.1 = sext i16 %1 to i32 414 %add.1 = add nsw i32 %conv, %conv.1 415 ret i32 %add.1 416} 417 418define i32 @addv4i32i16(i16* %x) { 419; CHECK-LABEL: addv4i32i16: 420; CHECK: @ %bb.0: @ %entry 421; CHECK-NEXT: vldrh.s32 q0, [r0] 422; CHECK-NEXT: vaddv.u32 r0, q0 423; CHECK-NEXT: bx lr 424entry: 425 %0 = bitcast i16* %x to <4 x i16>* 426 %1 = load <4 x i16>, <4 x i16>* %0, align 2 427 %2 = sext <4 x i16> %1 to <4 x i32> 428 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) 429 ret i32 %3 430} 431 432define i32 @addv8i32i16(i16* %x) { 433; CHECK-LABEL: addv8i32i16: 434; CHECK: @ %bb.0: @ %entry 435; CHECK-NEXT: vldrh.u16 q0, [r0] 436; CHECK-NEXT: vaddv.s16 r0, q0 437; CHECK-NEXT: bx lr 438entry: 439 %0 = bitcast i16* %x to <8 x i16>* 440 %1 = load <8 x i16>, <8 x i16>* %0, align 2 441 %2 = sext <8 x i16> %1 to <8 x i32> 442 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> 
%2) 443 ret i32 %3 444} 445 446define i32 @addv16i32i16(i16* %x) { 447; CHECK-LABEL: addv16i32i16: 448; CHECK: @ %bb.0: @ %entry 449; CHECK-NEXT: vldrh.s32 q1, [r0] 450; CHECK-NEXT: vldrh.s32 q0, [r0, #8] 451; CHECK-NEXT: vaddv.u32 r2, q1 452; CHECK-NEXT: vaddva.u32 r2, q0 453; CHECK-NEXT: vldrh.s32 q0, [r0, #16] 454; CHECK-NEXT: vaddva.u32 r2, q0 455; CHECK-NEXT: vldrh.s32 q0, [r0, #24] 456; CHECK-NEXT: vaddva.u32 r2, q0 457; CHECK-NEXT: mov r0, r2 458; CHECK-NEXT: bx lr 459entry: 460 %0 = bitcast i16* %x to <16 x i16>* 461 %1 = load <16 x i16>, <16 x i16>* %0, align 2 462 %2 = sext <16 x i16> %1 to <16 x i32> 463 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 464 ret i32 %3 465} 466 467define i32 @addv24i32i16(i16* %x) { 468; CHECK-LABEL: addv24i32i16: 469; CHECK: @ %bb.0: @ %entry 470; CHECK-NEXT: vldrh.s32 q1, [r0] 471; CHECK-NEXT: vldrh.s32 q0, [r0, #8] 472; CHECK-NEXT: vaddv.u32 r2, q1 473; CHECK-NEXT: vaddva.u32 r2, q0 474; CHECK-NEXT: vldrh.s32 q0, [r0, #16] 475; CHECK-NEXT: vaddva.u32 r2, q0 476; CHECK-NEXT: vldrh.s32 q0, [r0, #24] 477; CHECK-NEXT: vaddva.u32 r2, q0 478; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 479; CHECK-NEXT: vaddva.s16 r2, q0 480; CHECK-NEXT: mov r0, r2 481; CHECK-NEXT: bx lr 482entry: 483 %0 = bitcast i16* %x to <16 x i16>* 484 %1 = load <16 x i16>, <16 x i16>* %0, align 2 485 %2 = sext <16 x i16> %1 to <16 x i32> 486 %arrayidx.16 = getelementptr inbounds i16, i16* %x, i32 16 487 %3 = bitcast i16* %arrayidx.16 to <8 x i16>* 488 %4 = load <8 x i16>, <8 x i16>* %3, align 2 489 %5 = sext <8 x i16> %4 to <8 x i32> 490 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 491 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) 492 %op.rdx = add nsw i32 %6, %7 493 ret i32 %op.rdx 494} 495 496define i32 @addv32i32i16(i16* %x) { 497; CHECK-LABEL: addv32i32i16: 498; CHECK: @ %bb.0: @ %entry 499; CHECK-NEXT: vldrh.s32 q1, [r0] 500; CHECK-NEXT: vldrh.s32 q0, [r0, #8] 501; CHECK-NEXT: vaddv.u32 r2, q1 502; CHECK-NEXT: vaddva.u32 r2, q0 503; CHECK-NEXT: vldrh.s32 q0, [r0, #16] 504; CHECK-NEXT: vaddva.u32 r2, q0 505; CHECK-NEXT: vldrh.s32 q0, [r0, #24] 506; CHECK-NEXT: vaddva.u32 r2, q0 507; CHECK-NEXT: vldrh.s32 q0, [r0, #32] 508; CHECK-NEXT: vaddva.u32 r2, q0 509; CHECK-NEXT: vldrh.s32 q0, [r0, #40] 510; CHECK-NEXT: vaddva.u32 r2, q0 511; CHECK-NEXT: vldrh.s32 q0, [r0, #48] 512; CHECK-NEXT: vaddva.u32 r2, q0 513; CHECK-NEXT: vldrh.s32 q0, [r0, #56] 514; CHECK-NEXT: vaddva.u32 r2, q0 515; CHECK-NEXT: mov r0, r2 516; CHECK-NEXT: bx lr 517entry: 518 %0 = bitcast i16* %x to <32 x i16>* 519 %1 = load <32 x i16>, <32 x i16>* %0, align 2 520 %2 = sext <32 x i16> %1 to <32 x i32> 521 %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2) 522 ret i32 %3 523} 524 525define i32 @addv64i32i16(i16* %x) { 526; CHECK-LABEL: addv64i32i16: 527; CHECK: @ %bb.0: @ %entry 528; CHECK-NEXT: vldrh.s32 q1, [r0] 529; CHECK-NEXT: vldrh.s32 q0, [r0, #8] 530; CHECK-NEXT: ldrsh.w r1, [r0, #120] 531; CHECK-NEXT: vaddv.u32 r2, q1 532; CHECK-NEXT: ldrsh.w r3, [r0, #122] 533; CHECK-NEXT: vaddva.u32 r2, q0 534; CHECK-NEXT: vldrh.s32 q0, [r0, #16] 535; CHECK-NEXT: ldrsh.w r12, [r0, #124] 536; CHECK-NEXT: vaddva.u32 r2, q0 537; CHECK-NEXT: vldrh.s32 q0, [r0, #24] 538; CHECK-NEXT: vaddva.u32 r2, q0 539; CHECK-NEXT: vldrh.s32 q0, [r0, #32] 540; CHECK-NEXT: vaddva.u32 r2, q0 541; CHECK-NEXT: vldrh.s32 q0, [r0, #40] 542; CHECK-NEXT: vaddva.u32 r2, q0 543; CHECK-NEXT: vldrh.s32 q0, [r0, #48] 544; CHECK-NEXT: vaddva.u32 r2, q0 545; CHECK-NEXT: vldrh.s32 q0, [r0, #56] 546; CHECK-NEXT: 
vaddva.u32 r2, q0 547; CHECK-NEXT: vldrh.s32 q0, [r0, #64] 548; CHECK-NEXT: vaddva.u32 r2, q0 549; CHECK-NEXT: vldrh.s32 q0, [r0, #72] 550; CHECK-NEXT: vaddva.u32 r2, q0 551; CHECK-NEXT: vldrh.s32 q0, [r0, #80] 552; CHECK-NEXT: vaddva.u32 r2, q0 553; CHECK-NEXT: vldrh.s32 q0, [r0, #88] 554; CHECK-NEXT: vaddva.u32 r2, q0 555; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 556; CHECK-NEXT: vaddva.s16 r2, q0 557; CHECK-NEXT: vldrh.s32 q0, [r0, #112] 558; CHECK-NEXT: ldrsh.w r0, [r0, #126] 559; CHECK-NEXT: vaddva.u32 r2, q0 560; CHECK-NEXT: add r1, r2 561; CHECK-NEXT: add r1, r3 562; CHECK-NEXT: add r1, r12 563; CHECK-NEXT: add r0, r1 564; CHECK-NEXT: bx lr 565entry: 566 %0 = bitcast i16* %x to <32 x i16>* 567 %1 = load <32 x i16>, <32 x i16>* %0, align 2 568 %2 = sext <32 x i16> %1 to <32 x i32> 569 %arrayidx.32 = getelementptr inbounds i16, i16* %x, i32 32 570 %3 = bitcast i16* %arrayidx.32 to <16 x i16>* 571 %4 = load <16 x i16>, <16 x i16>* %3, align 2 572 %5 = sext <16 x i16> %4 to <16 x i32> 573 %arrayidx.48 = getelementptr inbounds i16, i16* %x, i32 48 574 %6 = bitcast i16* %arrayidx.48 to <8 x i16>* 575 %7 = load <8 x i16>, <8 x i16>* %6, align 2 576 %8 = sext <8 x i16> %7 to <8 x i32> 577 %arrayidx.56 = getelementptr inbounds i16, i16* %x, i32 56 578 %9 = bitcast i16* %arrayidx.56 to <4 x i16>* 579 %10 = load <4 x i16>, <4 x i16>* %9, align 2 580 %11 = sext <4 x i16> %10 to <4 x i32> 581 %arrayidx.60 = getelementptr inbounds i16, i16* %x, i32 60 582 %12 = load i16, i16* %arrayidx.60, align 2 583 %conv.60 = sext i16 %12 to i32 584 %arrayidx.61 = getelementptr inbounds i16, i16* %x, i32 61 585 %13 = load i16, i16* %arrayidx.61, align 2 586 %conv.61 = sext i16 %13 to i32 587 %arrayidx.62 = getelementptr inbounds i16, i16* %x, i32 62 588 %14 = load i16, i16* %arrayidx.62, align 2 589 %conv.62 = sext i16 %14 to i32 590 %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2) 591 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5) 592 %op.rdx = add nsw i32 %15, %16 593 %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) 594 %op.rdx8 = add nsw i32 %op.rdx, %17 595 %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11) 596 %op.rdx9 = add nsw i32 %op.rdx8, %18 597 %19 = add nsw i32 %op.rdx9, %conv.60 598 %20 = add nsw i32 %19, %conv.61 599 %21 = add nsw i32 %20, %conv.62 600 %arrayidx.63 = getelementptr inbounds i16, i16* %x, i32 63 601 %22 = load i16, i16* %arrayidx.63, align 2 602 %conv.63 = sext i16 %22 to i32 603 %add.63 = add nsw i32 %21, %conv.63 604 ret i32 %add.63 605} 606 607define i32 @addv128i32i16(i16* %x) { 608; CHECK-LABEL: addv128i32i16: 609; CHECK: @ %bb.0: @ %entry 610; CHECK-NEXT: vldrh.u16 q1, [r0] 611; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 612; CHECK-NEXT: vaddv.s16 r2, q1 613; CHECK-NEXT: vaddva.s16 r2, q0 614; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 615; CHECK-NEXT: vaddva.s16 r2, q0 616; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 617; CHECK-NEXT: vaddva.s16 r2, q0 618; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 619; CHECK-NEXT: vaddva.s16 r2, q0 620; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 621; CHECK-NEXT: vaddva.s16 r2, q0 622; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 623; CHECK-NEXT: vaddva.s16 r2, q0 624; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 625; CHECK-NEXT: vaddva.s16 r2, q0 626; CHECK-NEXT: vldrh.u16 q0, [r0, #128] 627; CHECK-NEXT: vaddva.s16 r2, q0 628; CHECK-NEXT: vldrh.u16 q0, [r0, #144] 629; CHECK-NEXT: vaddva.s16 r2, q0 630; CHECK-NEXT: vldrh.u16 q0, [r0, #160] 631; CHECK-NEXT: vaddva.s16 r2, q0 632; CHECK-NEXT: vldrh.u16 q0, [r0, #176] 633; CHECK-NEXT: vaddva.s16 r2, q0 634; 
CHECK-NEXT: vldrh.u16 q0, [r0, #192] 635; CHECK-NEXT: vaddva.s16 r2, q0 636; CHECK-NEXT: vldrh.u16 q0, [r0, #208] 637; CHECK-NEXT: vaddva.s16 r2, q0 638; CHECK-NEXT: vldrh.u16 q0, [r0, #224] 639; CHECK-NEXT: vaddva.s16 r2, q0 640; CHECK-NEXT: vldrh.u16 q0, [r0, #240] 641; CHECK-NEXT: vaddva.s16 r2, q0 642; CHECK-NEXT: mov r0, r2 643; CHECK-NEXT: bx lr 644entry: 645 %0 = bitcast i16* %x to <8 x i16>* 646 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 647 %1 = sext <8 x i16> %wide.load to <8 x i32> 648 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) 649 %3 = getelementptr inbounds i16, i16* %x, i32 8 650 %4 = bitcast i16* %3 to <8 x i16>* 651 %wide.load.1 = load <8 x i16>, <8 x i16>* %4, align 2 652 %5 = sext <8 x i16> %wide.load.1 to <8 x i32> 653 %6 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) 654 %7 = add i32 %6, %2 655 %8 = getelementptr inbounds i16, i16* %x, i32 16 656 %9 = bitcast i16* %8 to <8 x i16>* 657 %wide.load.2 = load <8 x i16>, <8 x i16>* %9, align 2 658 %10 = sext <8 x i16> %wide.load.2 to <8 x i32> 659 %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %10) 660 %12 = add i32 %11, %7 661 %13 = getelementptr inbounds i16, i16* %x, i32 24 662 %14 = bitcast i16* %13 to <8 x i16>* 663 %wide.load.3 = load <8 x i16>, <8 x i16>* %14, align 2 664 %15 = sext <8 x i16> %wide.load.3 to <8 x i32> 665 %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) 666 %17 = add i32 %16, %12 667 %18 = getelementptr inbounds i16, i16* %x, i32 32 668 %19 = bitcast i16* %18 to <8 x i16>* 669 %wide.load.4 = load <8 x i16>, <8 x i16>* %19, align 2 670 %20 = sext <8 x i16> %wide.load.4 to <8 x i32> 671 %21 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %20) 672 %22 = add i32 %21, %17 673 %23 = getelementptr inbounds i16, i16* %x, i32 40 674 %24 = bitcast i16* %23 to <8 x i16>* 675 %wide.load.5 = load <8 x i16>, <8 x i16>* %24, align 2 676 %25 = sext <8 x i16> %wide.load.5 to <8 x i32> 677 %26 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %25) 678 %27 = add i32 %26, %22 679 %28 = getelementptr inbounds i16, i16* %x, i32 48 680 %29 = bitcast i16* %28 to <8 x i16>* 681 %wide.load.6 = load <8 x i16>, <8 x i16>* %29, align 2 682 %30 = sext <8 x i16> %wide.load.6 to <8 x i32> 683 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30) 684 %32 = add i32 %31, %27 685 %33 = getelementptr inbounds i16, i16* %x, i32 56 686 %34 = bitcast i16* %33 to <8 x i16>* 687 %wide.load.7 = load <8 x i16>, <8 x i16>* %34, align 2 688 %35 = sext <8 x i16> %wide.load.7 to <8 x i32> 689 %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35) 690 %37 = add i32 %36, %32 691 %38 = getelementptr inbounds i16, i16* %x, i32 64 692 %39 = bitcast i16* %38 to <8 x i16>* 693 %wide.load.8 = load <8 x i16>, <8 x i16>* %39, align 2 694 %40 = sext <8 x i16> %wide.load.8 to <8 x i32> 695 %41 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40) 696 %42 = add i32 %41, %37 697 %43 = getelementptr inbounds i16, i16* %x, i32 72 698 %44 = bitcast i16* %43 to <8 x i16>* 699 %wide.load.9 = load <8 x i16>, <8 x i16>* %44, align 2 700 %45 = sext <8 x i16> %wide.load.9 to <8 x i32> 701 %46 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %45) 702 %47 = add i32 %46, %42 703 %48 = getelementptr inbounds i16, i16* %x, i32 80 704 %49 = bitcast i16* %48 to <8 x i16>* 705 %wide.load.10 = load <8 x i16>, <8 x i16>* %49, align 2 706 %50 = sext <8 x i16> %wide.load.10 to <8 x i32> 707 %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50) 708 %52 = add i32 %51, %47 709 %53 = getelementptr inbounds i16, 
i16* %x, i32 88 710 %54 = bitcast i16* %53 to <8 x i16>* 711 %wide.load.11 = load <8 x i16>, <8 x i16>* %54, align 2 712 %55 = sext <8 x i16> %wide.load.11 to <8 x i32> 713 %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55) 714 %57 = add i32 %56, %52 715 %58 = getelementptr inbounds i16, i16* %x, i32 96 716 %59 = bitcast i16* %58 to <8 x i16>* 717 %wide.load.12 = load <8 x i16>, <8 x i16>* %59, align 2 718 %60 = sext <8 x i16> %wide.load.12 to <8 x i32> 719 %61 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %60) 720 %62 = add i32 %61, %57 721 %63 = getelementptr inbounds i16, i16* %x, i32 104 722 %64 = bitcast i16* %63 to <8 x i16>* 723 %wide.load.13 = load <8 x i16>, <8 x i16>* %64, align 2 724 %65 = sext <8 x i16> %wide.load.13 to <8 x i32> 725 %66 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %65) 726 %67 = add i32 %66, %62 727 %68 = getelementptr inbounds i16, i16* %x, i32 112 728 %69 = bitcast i16* %68 to <8 x i16>* 729 %wide.load.14 = load <8 x i16>, <8 x i16>* %69, align 2 730 %70 = sext <8 x i16> %wide.load.14 to <8 x i32> 731 %71 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %70) 732 %72 = add i32 %71, %67 733 %73 = getelementptr inbounds i16, i16* %x, i32 120 734 %74 = bitcast i16* %73 to <8 x i16>* 735 %wide.load.15 = load <8 x i16>, <8 x i16>* %74, align 2 736 %75 = sext <8 x i16> %wide.load.15 to <8 x i32> 737 %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75) 738 %77 = add i32 %76, %72 739 ret i32 %77 740} 741 742define i32 @addv2i32i8(i8* %x) { 743; CHECK-LABEL: addv2i32i8: 744; CHECK: @ %bb.0: @ %entry 745; CHECK-NEXT: ldrb r1, [r0] 746; CHECK-NEXT: ldrb r0, [r0, #1] 747; CHECK-NEXT: add r0, r1 748; CHECK-NEXT: bx lr 749entry: 750 %0 = load i8, i8* %x, align 1 751 %conv = zext i8 %0 to i32 752 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1 753 %1 = load i8, i8* %arrayidx.1, align 1 754 %conv.1 = zext i8 %1 to i32 755 %add.1 = add nuw nsw i32 %conv, %conv.1 756 ret i32 %add.1 757} 758 759define i32 @addv4i32i8(i8* %x) { 760; CHECK-LABEL: addv4i32i8: 761; CHECK: @ %bb.0: @ %entry 762; CHECK-NEXT: vldrb.u32 q0, [r0] 763; CHECK-NEXT: vaddv.u32 r0, q0 764; CHECK-NEXT: bx lr 765entry: 766 %0 = bitcast i8* %x to <4 x i8>* 767 %1 = load <4 x i8>, <4 x i8>* %0, align 1 768 %2 = zext <4 x i8> %1 to <4 x i32> 769 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) 770 ret i32 %3 771} 772 773define i32 @addv8i32i8(i8* %x) { 774; CHECK-LABEL: addv8i32i8: 775; CHECK: @ %bb.0: @ %entry 776; CHECK-NEXT: vldrb.u16 q0, [r0] 777; CHECK-NEXT: vaddv.u16 r0, q0 778; CHECK-NEXT: bx lr 779entry: 780 %0 = bitcast i8* %x to <8 x i8>* 781 %1 = load <8 x i8>, <8 x i8>* %0, align 1 782 %2 = zext <8 x i8> %1 to <8 x i32> 783 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) 784 ret i32 %3 785} 786 787define i32 @addv16i32i8(i8* %x) { 788; CHECK-LABEL: addv16i32i8: 789; CHECK: @ %bb.0: @ %entry 790; CHECK-NEXT: vldrb.u8 q0, [r0] 791; CHECK-NEXT: vaddv.u8 r0, q0 792; CHECK-NEXT: bx lr 793entry: 794 %0 = bitcast i8* %x to <16 x i8>* 795 %1 = load <16 x i8>, <16 x i8>* %0, align 1 796 %2 = zext <16 x i8> %1 to <16 x i32> 797 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 798 ret i32 %3 799} 800 801define i32 @addv24i32i8(i8* %x) { 802; CHECK-LABEL: addv24i32i8: 803; CHECK: @ %bb.0: @ %entry 804; CHECK-NEXT: vldrb.u8 q1, [r0] 805; CHECK-NEXT: vldrb.u16 q0, [r0, #16] 806; CHECK-NEXT: vaddv.u8 r0, q1 807; CHECK-NEXT: vaddva.u16 r0, q0 808; CHECK-NEXT: bx lr 809entry: 810 %0 = bitcast i8* %x to <16 x i8>* 811 %1 = load <16 x i8>, <16 x i8>* %0, 
align 1 812 %2 = zext <16 x i8> %1 to <16 x i32> 813 %arrayidx.16 = getelementptr inbounds i8, i8* %x, i32 16 814 %3 = bitcast i8* %arrayidx.16 to <8 x i8>* 815 %4 = load <8 x i8>, <8 x i8>* %3, align 1 816 %5 = zext <8 x i8> %4 to <8 x i32> 817 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 818 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) 819 %op.rdx = add nuw nsw i32 %6, %7 820 ret i32 %op.rdx 821} 822 823define i32 @addv32i32i8(i8* %x) { 824; CHECK-LABEL: addv32i32i8: 825; CHECK: @ %bb.0: @ %entry 826; CHECK-NEXT: vldrb.u32 q1, [r0] 827; CHECK-NEXT: vldrb.u32 q0, [r0, #4] 828; CHECK-NEXT: vaddv.u32 r2, q1 829; CHECK-NEXT: vaddva.u32 r2, q0 830; CHECK-NEXT: vldrb.u32 q0, [r0, #8] 831; CHECK-NEXT: vaddva.u32 r2, q0 832; CHECK-NEXT: vldrb.u32 q0, [r0, #12] 833; CHECK-NEXT: vaddva.u32 r2, q0 834; CHECK-NEXT: vldrb.u32 q0, [r0, #16] 835; CHECK-NEXT: vaddva.u32 r2, q0 836; CHECK-NEXT: vldrb.u32 q0, [r0, #20] 837; CHECK-NEXT: vaddva.u32 r2, q0 838; CHECK-NEXT: vldrb.u32 q0, [r0, #24] 839; CHECK-NEXT: vaddva.u32 r2, q0 840; CHECK-NEXT: vldrb.u32 q0, [r0, #28] 841; CHECK-NEXT: vaddva.u32 r2, q0 842; CHECK-NEXT: mov r0, r2 843; CHECK-NEXT: bx lr 844entry: 845 %0 = bitcast i8* %x to <32 x i8>* 846 %1 = load <32 x i8>, <32 x i8>* %0, align 1 847 %2 = zext <32 x i8> %1 to <32 x i32> 848 %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2) 849 ret i32 %3 850} 851 852define i32 @addv64i32i8(i8* %x) { 853; CHECK-LABEL: addv64i32i8: 854; CHECK: @ %bb.0: @ %entry 855; CHECK-NEXT: vldrb.u32 q1, [r0] 856; CHECK-NEXT: vldrb.u32 q0, [r0, #4] 857; CHECK-NEXT: ldrb.w r1, [r0, #60] 858; CHECK-NEXT: vaddv.u32 r2, q1 859; CHECK-NEXT: ldrb.w r3, [r0, #61] 860; CHECK-NEXT: vaddva.u32 r2, q0 861; CHECK-NEXT: vldrb.u32 q0, [r0, #8] 862; CHECK-NEXT: ldrb.w r12, [r0, #62] 863; CHECK-NEXT: vaddva.u32 r2, q0 864; CHECK-NEXT: vldrb.u32 q0, [r0, #12] 865; CHECK-NEXT: vaddva.u32 r2, q0 866; CHECK-NEXT: vldrb.u32 q0, [r0, #16] 867; CHECK-NEXT: vaddva.u32 r2, q0 868; CHECK-NEXT: vldrb.u32 q0, [r0, #20] 869; CHECK-NEXT: vaddva.u32 r2, q0 870; CHECK-NEXT: vldrb.u32 q0, [r0, #24] 871; CHECK-NEXT: vaddva.u32 r2, q0 872; CHECK-NEXT: vldrb.u32 q0, [r0, #28] 873; CHECK-NEXT: vaddva.u32 r2, q0 874; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 875; CHECK-NEXT: vaddva.u8 r2, q0 876; CHECK-NEXT: vldrb.u16 q0, [r0, #48] 877; CHECK-NEXT: vaddva.u16 r2, q0 878; CHECK-NEXT: vldrb.u32 q0, [r0, #56] 879; CHECK-NEXT: ldrb.w r0, [r0, #63] 880; CHECK-NEXT: vaddva.u32 r2, q0 881; CHECK-NEXT: add r1, r2 882; CHECK-NEXT: add r1, r3 883; CHECK-NEXT: add r1, r12 884; CHECK-NEXT: add r0, r1 885; CHECK-NEXT: bx lr 886entry: 887 %0 = bitcast i8* %x to <32 x i8>* 888 %1 = load <32 x i8>, <32 x i8>* %0, align 1 889 %2 = zext <32 x i8> %1 to <32 x i32> 890 %arrayidx.32 = getelementptr inbounds i8, i8* %x, i32 32 891 %3 = bitcast i8* %arrayidx.32 to <16 x i8>* 892 %4 = load <16 x i8>, <16 x i8>* %3, align 1 893 %5 = zext <16 x i8> %4 to <16 x i32> 894 %arrayidx.48 = getelementptr inbounds i8, i8* %x, i32 48 895 %6 = bitcast i8* %arrayidx.48 to <8 x i8>* 896 %7 = load <8 x i8>, <8 x i8>* %6, align 1 897 %8 = zext <8 x i8> %7 to <8 x i32> 898 %arrayidx.56 = getelementptr inbounds i8, i8* %x, i32 56 899 %9 = bitcast i8* %arrayidx.56 to <4 x i8>* 900 %10 = load <4 x i8>, <4 x i8>* %9, align 1 901 %11 = zext <4 x i8> %10 to <4 x i32> 902 %arrayidx.60 = getelementptr inbounds i8, i8* %x, i32 60 903 %12 = load i8, i8* %arrayidx.60, align 1 904 %conv.60 = zext i8 %12 to i32 905 %arrayidx.61 = getelementptr inbounds i8, i8* %x, i32 61 906 %13 
= load i8, i8* %arrayidx.61, align 1 907 %conv.61 = zext i8 %13 to i32 908 %arrayidx.62 = getelementptr inbounds i8, i8* %x, i32 62 909 %14 = load i8, i8* %arrayidx.62, align 1 910 %conv.62 = zext i8 %14 to i32 911 %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2) 912 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5) 913 %op.rdx = add nuw nsw i32 %15, %16 914 %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) 915 %op.rdx8 = add nuw nsw i32 %op.rdx, %17 916 %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11) 917 %op.rdx9 = add nuw nsw i32 %op.rdx8, %18 918 %19 = add nuw nsw i32 %op.rdx9, %conv.60 919 %20 = add nuw nsw i32 %19, %conv.61 920 %21 = add nuw nsw i32 %20, %conv.62 921 %arrayidx.63 = getelementptr inbounds i8, i8* %x, i32 63 922 %22 = load i8, i8* %arrayidx.63, align 1 923 %conv.63 = zext i8 %22 to i32 924 %add.63 = add nuw nsw i32 %21, %conv.63 925 ret i32 %add.63 926} 927 928define i32 @addv128i32i8(i8* %x) { 929; CHECK-LABEL: addv128i32i8: 930; CHECK: @ %bb.0: @ %entry 931; CHECK-NEXT: vldrb.u8 q1, [r0] 932; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 933; CHECK-NEXT: mov r1, r0 934; CHECK-NEXT: vaddv.u8 r0, q1 935; CHECK-NEXT: vaddva.u8 r0, q0 936; CHECK-NEXT: vldrb.u8 q0, [r1, #32] 937; CHECK-NEXT: vaddva.u8 r0, q0 938; CHECK-NEXT: vldrb.u8 q0, [r1, #48] 939; CHECK-NEXT: vaddva.u8 r0, q0 940; CHECK-NEXT: vldrb.u8 q0, [r1, #64] 941; CHECK-NEXT: vaddva.u8 r0, q0 942; CHECK-NEXT: vldrb.u8 q0, [r1, #80] 943; CHECK-NEXT: vaddva.u8 r0, q0 944; CHECK-NEXT: vldrb.u8 q0, [r1, #96] 945; CHECK-NEXT: vaddva.u8 r0, q0 946; CHECK-NEXT: vldrb.u8 q0, [r1, #112] 947; CHECK-NEXT: vaddva.u8 r0, q0 948; CHECK-NEXT: bx lr 949entry: 950 %0 = bitcast i8* %x to <16 x i8>* 951 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 952 %1 = zext <16 x i8> %wide.load to <16 x i32> 953 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) 954 %3 = getelementptr inbounds i8, i8* %x, i32 16 955 %4 = bitcast i8* %3 to <16 x i8>* 956 %wide.load.1 = load <16 x i8>, <16 x i8>* %4, align 1 957 %5 = zext <16 x i8> %wide.load.1 to <16 x i32> 958 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5) 959 %7 = add i32 %6, %2 960 %8 = getelementptr inbounds i8, i8* %x, i32 32 961 %9 = bitcast i8* %8 to <16 x i8>* 962 %wide.load.2 = load <16 x i8>, <16 x i8>* %9, align 1 963 %10 = zext <16 x i8> %wide.load.2 to <16 x i32> 964 %11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %10) 965 %12 = add i32 %11, %7 966 %13 = getelementptr inbounds i8, i8* %x, i32 48 967 %14 = bitcast i8* %13 to <16 x i8>* 968 %wide.load.3 = load <16 x i8>, <16 x i8>* %14, align 1 969 %15 = zext <16 x i8> %wide.load.3 to <16 x i32> 970 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15) 971 %17 = add i32 %16, %12 972 %18 = getelementptr inbounds i8, i8* %x, i32 64 973 %19 = bitcast i8* %18 to <16 x i8>* 974 %wide.load.4 = load <16 x i8>, <16 x i8>* %19, align 1 975 %20 = zext <16 x i8> %wide.load.4 to <16 x i32> 976 %21 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %20) 977 %22 = add i32 %21, %17 978 %23 = getelementptr inbounds i8, i8* %x, i32 80 979 %24 = bitcast i8* %23 to <16 x i8>* 980 %wide.load.5 = load <16 x i8>, <16 x i8>* %24, align 1 981 %25 = zext <16 x i8> %wide.load.5 to <16 x i32> 982 %26 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %25) 983 %27 = add i32 %26, %22 984 %28 = getelementptr inbounds i8, i8* %x, i32 96 985 %29 = bitcast i8* %28 to <16 x i8>* 986 %wide.load.6 = load <16 x i8>, <16 x i8>* %29, align 1 987 %30 = zext <16 x i8> %wide.load.6 to <16 x 
i32> 988 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30) 989 %32 = add i32 %31, %27 990 %33 = getelementptr inbounds i8, i8* %x, i32 112 991 %34 = bitcast i8* %33 to <16 x i8>* 992 %wide.load.7 = load <16 x i8>, <16 x i8>* %34, align 1 993 %35 = zext <16 x i8> %wide.load.7 to <16 x i32> 994 %36 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %35) 995 %37 = add i32 %36, %32 996 ret i32 %37 997} 998 999define signext i16 @addv2i16i16(i16* %x) { 1000; CHECK-LABEL: addv2i16i16: 1001; CHECK: @ %bb.0: @ %entry 1002; CHECK-NEXT: ldrh r1, [r0] 1003; CHECK-NEXT: ldrh r0, [r0, #2] 1004; CHECK-NEXT: add r0, r1 1005; CHECK-NEXT: sxth r0, r0 1006; CHECK-NEXT: bx lr 1007entry: 1008 %0 = load i16, i16* %x, align 2 1009 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1 1010 %1 = load i16, i16* %arrayidx.1, align 2 1011 %add.1 = add i16 %1, %0 1012 ret i16 %add.1 1013} 1014 1015define signext i16 @addv4i16i16(i16* %x) { 1016; CHECK-LABEL: addv4i16i16: 1017; CHECK: @ %bb.0: @ %entry 1018; CHECK-NEXT: vldrh.u32 q0, [r0] 1019; CHECK-NEXT: vaddv.u32 r0, q0 1020; CHECK-NEXT: sxth r0, r0 1021; CHECK-NEXT: bx lr 1022entry: 1023 %0 = bitcast i16* %x to <4 x i16>* 1024 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1025 %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1) 1026 ret i16 %2 1027} 1028 1029define signext i16 @addv8i16i16(i16* %x) { 1030; CHECK-LABEL: addv8i16i16: 1031; CHECK: @ %bb.0: @ %entry 1032; CHECK-NEXT: vldrh.u16 q0, [r0] 1033; CHECK-NEXT: vaddv.u16 r0, q0 1034; CHECK-NEXT: sxth r0, r0 1035; CHECK-NEXT: bx lr 1036entry: 1037 %0 = bitcast i16* %x to <8 x i16>* 1038 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1039 %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) 1040 ret i16 %2 1041} 1042 1043define signext i16 @addv16i16i16(i16* %x) { 1044; CHECK-LABEL: addv16i16i16: 1045; CHECK: @ %bb.0: @ %entry 1046; CHECK-NEXT: vldrh.u16 q1, [r0] 1047; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 1048; CHECK-NEXT: vaddv.u16 r0, q1 1049; CHECK-NEXT: vaddva.u16 r0, q0 1050; CHECK-NEXT: sxth r0, r0 1051; CHECK-NEXT: bx lr 1052entry: 1053 %0 = bitcast i16* %x to <16 x i16>* 1054 %1 = load <16 x i16>, <16 x i16>* %0, align 2 1055 %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) 1056 ret i16 %2 1057} 1058 1059define signext i16 @addv24i16i16(i16* %x) { 1060; CHECK-LABEL: addv24i16i16: 1061; CHECK: @ %bb.0: @ %entry 1062; CHECK-NEXT: vldrh.u16 q1, [r0] 1063; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 1064; CHECK-NEXT: vaddv.u16 r2, q1 1065; CHECK-NEXT: vaddva.u16 r2, q0 1066; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 1067; CHECK-NEXT: vaddva.u16 r2, q0 1068; CHECK-NEXT: sxth r0, r2 1069; CHECK-NEXT: bx lr 1070entry: 1071 %0 = bitcast i16* %x to <8 x i16>* 1072 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1073 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8 1074 %2 = bitcast i16* %arrayidx.8 to <16 x i16>* 1075 %3 = load <16 x i16>, <16 x i16>* %2, align 2 1076 %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3) 1077 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) 1078 %op.rdx = add i16 %4, %5 1079 ret i16 %op.rdx 1080} 1081 1082define signext i16 @addv32i16i16(i16* %x) { 1083; CHECK-LABEL: addv32i16i16: 1084; CHECK: @ %bb.0: @ %entry 1085; CHECK-NEXT: vldrh.u16 q1, [r0] 1086; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 1087; CHECK-NEXT: vaddv.u16 r2, q1 1088; CHECK-NEXT: vaddva.u16 r2, q0 1089; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 1090; CHECK-NEXT: vaddva.u16 r2, q0 1091; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 1092; CHECK-NEXT: vaddva.u16 r2, q0 1093; CHECK-NEXT: sxth r0, r2 
1094; CHECK-NEXT: bx lr 1095entry: 1096 %0 = bitcast i16* %x to <32 x i16>* 1097 %1 = load <32 x i16>, <32 x i16>* %0, align 2 1098 %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1) 1099 ret i16 %2 1100} 1101 1102define signext i16 @addv64i16i16(i16* %x) { 1103; CHECK-LABEL: addv64i16i16: 1104; CHECK: @ %bb.0: @ %entry 1105; CHECK-NEXT: vldrh.u16 q1, [r0] 1106; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 1107; CHECK-NEXT: vaddv.u16 r2, q1 1108; CHECK-NEXT: vaddva.u16 r2, q0 1109; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 1110; CHECK-NEXT: vaddva.u16 r2, q0 1111; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 1112; CHECK-NEXT: vaddva.u16 r2, q0 1113; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 1114; CHECK-NEXT: vaddva.u16 r2, q0 1115; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 1116; CHECK-NEXT: vaddva.u16 r2, q0 1117; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 1118; CHECK-NEXT: vaddva.u16 r2, q0 1119; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 1120; CHECK-NEXT: vaddva.u16 r2, q0 1121; CHECK-NEXT: sxth r0, r2 1122; CHECK-NEXT: bx lr 1123entry: 1124 %0 = bitcast i16* %x to <64 x i16>* 1125 %1 = load <64 x i16>, <64 x i16>* %0, align 2 1126 %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1) 1127 ret i16 %2 1128} 1129 1130define signext i16 @addv128i16i16(i16* %x) { 1131; CHECK-LABEL: addv128i16i16: 1132; CHECK: @ %bb.0: @ %entry 1133; CHECK-NEXT: vldrh.u16 q1, [r0] 1134; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 1135; CHECK-NEXT: vaddv.u16 r2, q1 1136; CHECK-NEXT: vaddva.u16 r2, q0 1137; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 1138; CHECK-NEXT: vaddva.u16 r2, q0 1139; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 1140; CHECK-NEXT: vaddva.u16 r2, q0 1141; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 1142; CHECK-NEXT: vaddva.u16 r2, q0 1143; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 1144; CHECK-NEXT: vaddva.u16 r2, q0 1145; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 1146; CHECK-NEXT: vaddva.u16 r2, q0 1147; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 1148; CHECK-NEXT: vaddva.u16 r2, q0 1149; CHECK-NEXT: vldrh.u16 q0, [r0, #128] 1150; CHECK-NEXT: vaddva.u16 r2, q0 1151; CHECK-NEXT: vldrh.u16 q0, [r0, #144] 1152; CHECK-NEXT: vaddva.u16 r2, q0 1153; CHECK-NEXT: vldrh.u16 q0, [r0, #160] 1154; CHECK-NEXT: vaddva.u16 r2, q0 1155; CHECK-NEXT: vldrh.u16 q0, [r0, #176] 1156; CHECK-NEXT: vaddva.u16 r2, q0 1157; CHECK-NEXT: vldrh.u16 q0, [r0, #192] 1158; CHECK-NEXT: vaddva.u16 r2, q0 1159; CHECK-NEXT: vldrh.u16 q0, [r0, #208] 1160; CHECK-NEXT: vaddva.u16 r2, q0 1161; CHECK-NEXT: vldrh.u16 q0, [r0, #224] 1162; CHECK-NEXT: vaddva.u16 r2, q0 1163; CHECK-NEXT: vldrh.u16 q0, [r0, #240] 1164; CHECK-NEXT: vaddva.u16 r2, q0 1165; CHECK-NEXT: sxth r0, r2 1166; CHECK-NEXT: bx lr 1167entry: 1168 %0 = bitcast i16* %x to <8 x i16>* 1169 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 1170 %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load) 1171 %2 = getelementptr inbounds i16, i16* %x, i32 8 1172 %3 = bitcast i16* %2 to <8 x i16>* 1173 %wide.load.1 = load <8 x i16>, <8 x i16>* %3, align 2 1174 %4 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1) 1175 %5 = add i16 %4, %1 1176 %6 = getelementptr inbounds i16, i16* %x, i32 16 1177 %7 = bitcast i16* %6 to <8 x i16>* 1178 %wide.load.2 = load <8 x i16>, <8 x i16>* %7, align 2 1179 %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2) 1180 %9 = add i16 %8, %5 1181 %10 = getelementptr inbounds i16, i16* %x, i32 24 1182 %11 = bitcast i16* %10 to <8 x i16>* 1183 %wide.load.3 = load <8 x i16>, <8 x i16>* %11, align 2 1184 %12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3) 1185 %13 = add i16 %12, 
%9 1186 %14 = getelementptr inbounds i16, i16* %x, i32 32 1187 %15 = bitcast i16* %14 to <8 x i16>* 1188 %wide.load.4 = load <8 x i16>, <8 x i16>* %15, align 2 1189 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4) 1190 %17 = add i16 %16, %13 1191 %18 = getelementptr inbounds i16, i16* %x, i32 40 1192 %19 = bitcast i16* %18 to <8 x i16>* 1193 %wide.load.5 = load <8 x i16>, <8 x i16>* %19, align 2 1194 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5) 1195 %21 = add i16 %20, %17 1196 %22 = getelementptr inbounds i16, i16* %x, i32 48 1197 %23 = bitcast i16* %22 to <8 x i16>* 1198 %wide.load.6 = load <8 x i16>, <8 x i16>* %23, align 2 1199 %24 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6) 1200 %25 = add i16 %24, %21 1201 %26 = getelementptr inbounds i16, i16* %x, i32 56 1202 %27 = bitcast i16* %26 to <8 x i16>* 1203 %wide.load.7 = load <8 x i16>, <8 x i16>* %27, align 2 1204 %28 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7) 1205 %29 = add i16 %28, %25 1206 %30 = getelementptr inbounds i16, i16* %x, i32 64 1207 %31 = bitcast i16* %30 to <8 x i16>* 1208 %wide.load.8 = load <8 x i16>, <8 x i16>* %31, align 2 1209 %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8) 1210 %33 = add i16 %32, %29 1211 %34 = getelementptr inbounds i16, i16* %x, i32 72 1212 %35 = bitcast i16* %34 to <8 x i16>* 1213 %wide.load.9 = load <8 x i16>, <8 x i16>* %35, align 2 1214 %36 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9) 1215 %37 = add i16 %36, %33 1216 %38 = getelementptr inbounds i16, i16* %x, i32 80 1217 %39 = bitcast i16* %38 to <8 x i16>* 1218 %wide.load.10 = load <8 x i16>, <8 x i16>* %39, align 2 1219 %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10) 1220 %41 = add i16 %40, %37 1221 %42 = getelementptr inbounds i16, i16* %x, i32 88 1222 %43 = bitcast i16* %42 to <8 x i16>* 1223 %wide.load.11 = load <8 x i16>, <8 x i16>* %43, align 2 1224 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11) 1225 %45 = add i16 %44, %41 1226 %46 = getelementptr inbounds i16, i16* %x, i32 96 1227 %47 = bitcast i16* %46 to <8 x i16>* 1228 %wide.load.12 = load <8 x i16>, <8 x i16>* %47, align 2 1229 %48 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12) 1230 %49 = add i16 %48, %45 1231 %50 = getelementptr inbounds i16, i16* %x, i32 104 1232 %51 = bitcast i16* %50 to <8 x i16>* 1233 %wide.load.13 = load <8 x i16>, <8 x i16>* %51, align 2 1234 %52 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13) 1235 %53 = add i16 %52, %49 1236 %54 = getelementptr inbounds i16, i16* %x, i32 112 1237 %55 = bitcast i16* %54 to <8 x i16>* 1238 %wide.load.14 = load <8 x i16>, <8 x i16>* %55, align 2 1239 %56 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14) 1240 %57 = add i16 %56, %53 1241 %58 = getelementptr inbounds i16, i16* %x, i32 120 1242 %59 = bitcast i16* %58 to <8 x i16>* 1243 %wide.load.15 = load <8 x i16>, <8 x i16>* %59, align 2 1244 %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15) 1245 %61 = add i16 %60, %57 1246 ret i16 %61 1247} 1248 1249define zeroext i8 @addv2i8i8(i8* %x) { 1250; CHECK-LABEL: addv2i8i8: 1251; CHECK: @ %bb.0: @ %entry 1252; CHECK-NEXT: ldrb r1, [r0] 1253; CHECK-NEXT: ldrb r0, [r0, #1] 1254; CHECK-NEXT: add r0, r1 1255; CHECK-NEXT: uxtb r0, r0 1256; CHECK-NEXT: bx lr 1257entry: 1258 %0 = load i8, i8* %x, align 1 1259 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1 1260 %1 = load i8, i8* %arrayidx.1, 
align 1 1261 %add.1 = add i8 %1, %0 1262 ret i8 %add.1 1263} 1264 1265define zeroext i8 @addv4i8i8(i8* %x) { 1266; CHECK-LABEL: addv4i8i8: 1267; CHECK: @ %bb.0: @ %entry 1268; CHECK-NEXT: vldrb.u32 q0, [r0] 1269; CHECK-NEXT: vaddv.u32 r0, q0 1270; CHECK-NEXT: uxtb r0, r0 1271; CHECK-NEXT: bx lr 1272entry: 1273 %0 = bitcast i8* %x to <4 x i8>* 1274 %1 = load <4 x i8>, <4 x i8>* %0, align 1 1275 %2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %1) 1276 ret i8 %2 1277} 1278 1279define zeroext i8 @addv8i8i8(i8* %x) { 1280; CHECK-LABEL: addv8i8i8: 1281; CHECK: @ %bb.0: @ %entry 1282; CHECK-NEXT: vldrb.u16 q0, [r0] 1283; CHECK-NEXT: vaddv.u16 r0, q0 1284; CHECK-NEXT: uxtb r0, r0 1285; CHECK-NEXT: bx lr 1286entry: 1287 %0 = bitcast i8* %x to <8 x i8>* 1288 %1 = load <8 x i8>, <8 x i8>* %0, align 1 1289 %2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1) 1290 ret i8 %2 1291} 1292 1293define zeroext i8 @addv16i8i8(i8* %x) { 1294; CHECK-LABEL: addv16i8i8: 1295; CHECK: @ %bb.0: @ %entry 1296; CHECK-NEXT: vldrb.u8 q0, [r0] 1297; CHECK-NEXT: vaddv.u8 r0, q0 1298; CHECK-NEXT: uxtb r0, r0 1299; CHECK-NEXT: bx lr 1300entry: 1301 %0 = bitcast i8* %x to <16 x i8>* 1302 %1 = load <16 x i8>, <16 x i8>* %0, align 1 1303 %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1) 1304 ret i8 %2 1305} 1306 1307define zeroext i8 @addv24i8i8(i8* %x) { 1308; CHECK-LABEL: addv24i8i8: 1309; CHECK: @ %bb.0: @ %entry 1310; CHECK-NEXT: vldrb.u16 q1, [r0] 1311; CHECK-NEXT: vldrb.u8 q0, [r0, #8] 1312; CHECK-NEXT: vaddv.u16 r0, q1 1313; CHECK-NEXT: vaddva.u8 r0, q0 1314; CHECK-NEXT: uxtb r0, r0 1315; CHECK-NEXT: bx lr 1316entry: 1317 %0 = bitcast i8* %x to <8 x i8>* 1318 %1 = load <8 x i8>, <8 x i8>* %0, align 1 1319 %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8 1320 %2 = bitcast i8* %arrayidx.8 to <16 x i8>* 1321 %3 = load <16 x i8>, <16 x i8>* %2, align 1 1322 %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %3) 1323 %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1) 1324 %op.rdx = add i8 %4, %5 1325 ret i8 %op.rdx 1326} 1327 1328define zeroext i8 @addv32i8i8(i8* %x) { 1329; CHECK-LABEL: addv32i8i8: 1330; CHECK: @ %bb.0: @ %entry 1331; CHECK-NEXT: vldrb.u8 q1, [r0] 1332; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 1333; CHECK-NEXT: vaddv.u8 r0, q1 1334; CHECK-NEXT: vaddva.u8 r0, q0 1335; CHECK-NEXT: uxtb r0, r0 1336; CHECK-NEXT: bx lr 1337entry: 1338 %0 = bitcast i8* %x to <32 x i8>* 1339 %1 = load <32 x i8>, <32 x i8>* %0, align 1 1340 %2 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %1) 1341 ret i8 %2 1342} 1343 1344define zeroext i8 @addv64i8i8(i8* %x) { 1345; CHECK-LABEL: addv64i8i8: 1346; CHECK: @ %bb.0: @ %entry 1347; CHECK-NEXT: vldrb.u8 q1, [r0] 1348; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 1349; CHECK-NEXT: vaddv.u8 r2, q1 1350; CHECK-NEXT: vaddva.u8 r2, q0 1351; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 1352; CHECK-NEXT: vaddva.u8 r2, q0 1353; CHECK-NEXT: vldrb.u8 q0, [r0, #48] 1354; CHECK-NEXT: vaddva.u8 r2, q0 1355; CHECK-NEXT: uxtb r0, r2 1356; CHECK-NEXT: bx lr 1357entry: 1358 %0 = bitcast i8* %x to <64 x i8>* 1359 %1 = load <64 x i8>, <64 x i8>* %0, align 1 1360 %2 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %1) 1361 ret i8 %2 1362} 1363 1364define zeroext i8 @addv128i8i8(i8* %x) { 1365; CHECK-LABEL: addv128i8i8: 1366; CHECK: @ %bb.0: @ %entry 1367; CHECK-NEXT: vldrb.u8 q1, [r0] 1368; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 1369; CHECK-NEXT: vaddv.u8 r2, q1 1370; CHECK-NEXT: vaddva.u8 r2, q0 1371; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 1372; CHECK-NEXT: vaddva.u8 r2, q0 1373; CHECK-NEXT: vldrb.u8 q0, [r0, 
#48] 1374; CHECK-NEXT: vaddva.u8 r2, q0 1375; CHECK-NEXT: vldrb.u8 q0, [r0, #64] 1376; CHECK-NEXT: vaddva.u8 r2, q0 1377; CHECK-NEXT: vldrb.u8 q0, [r0, #80] 1378; CHECK-NEXT: vaddva.u8 r2, q0 1379; CHECK-NEXT: vldrb.u8 q0, [r0, #96] 1380; CHECK-NEXT: vaddva.u8 r2, q0 1381; CHECK-NEXT: vldrb.u8 q0, [r0, #112] 1382; CHECK-NEXT: vaddva.u8 r2, q0 1383; CHECK-NEXT: uxtb r0, r2 1384; CHECK-NEXT: bx lr 1385entry: 1386 %0 = bitcast i8* %x to <16 x i8>* 1387 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 1388 %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load) 1389 %2 = getelementptr inbounds i8, i8* %x, i32 16 1390 %3 = bitcast i8* %2 to <16 x i8>* 1391 %wide.load.1 = load <16 x i8>, <16 x i8>* %3, align 1 1392 %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1) 1393 %5 = add i8 %4, %1 1394 %6 = getelementptr inbounds i8, i8* %x, i32 32 1395 %7 = bitcast i8* %6 to <16 x i8>* 1396 %wide.load.2 = load <16 x i8>, <16 x i8>* %7, align 1 1397 %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2) 1398 %9 = add i8 %8, %5 1399 %10 = getelementptr inbounds i8, i8* %x, i32 48 1400 %11 = bitcast i8* %10 to <16 x i8>* 1401 %wide.load.3 = load <16 x i8>, <16 x i8>* %11, align 1 1402 %12 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3) 1403 %13 = add i8 %12, %9 1404 %14 = getelementptr inbounds i8, i8* %x, i32 64 1405 %15 = bitcast i8* %14 to <16 x i8>* 1406 %wide.load.4 = load <16 x i8>, <16 x i8>* %15, align 1 1407 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4) 1408 %17 = add i8 %16, %13 1409 %18 = getelementptr inbounds i8, i8* %x, i32 80 1410 %19 = bitcast i8* %18 to <16 x i8>* 1411 %wide.load.5 = load <16 x i8>, <16 x i8>* %19, align 1 1412 %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5) 1413 %21 = add i8 %20, %17 1414 %22 = getelementptr inbounds i8, i8* %x, i32 96 1415 %23 = bitcast i8* %22 to <16 x i8>* 1416 %wide.load.6 = load <16 x i8>, <16 x i8>* %23, align 1 1417 %24 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6) 1418 %25 = add i8 %24, %21 1419 %26 = getelementptr inbounds i8, i8* %x, i32 112 1420 %27 = bitcast i8* %26 to <16 x i8>* 1421 %wide.load.7 = load <16 x i8>, <16 x i8>* %27, align 1 1422 %28 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7) 1423 %29 = add i8 %28, %25 1424 ret i8 %29 1425} 1426 1427 1428 1429define i32 @mlav2i32i32(i32* %x, i32* %y) { 1430; CHECK-LABEL: mlav2i32i32: 1431; CHECK: @ %bb.0: @ %entry 1432; CHECK-NEXT: ldrd r2, r0, [r0] 1433; CHECK-NEXT: ldrd r3, r1, [r1] 1434; CHECK-NEXT: muls r2, r3, r2 1435; CHECK-NEXT: mla r0, r1, r0, r2 1436; CHECK-NEXT: bx lr 1437entry: 1438 %0 = load i32, i32* %x, align 4 1439 %1 = load i32, i32* %y, align 4 1440 %mul = mul nsw i32 %1, %0 1441 %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1 1442 %2 = load i32, i32* %arrayidx.1, align 4 1443 %arrayidx1.1 = getelementptr inbounds i32, i32* %y, i32 1 1444 %3 = load i32, i32* %arrayidx1.1, align 4 1445 %mul.1 = mul nsw i32 %3, %2 1446 %add.1 = add nsw i32 %mul.1, %mul 1447 ret i32 %add.1 1448} 1449 1450define i32 @mlav4i32i32(i32* %x, i32* %y) { 1451; CHECK-LABEL: mlav4i32i32: 1452; CHECK: @ %bb.0: @ %entry 1453; CHECK-NEXT: vldrw.u32 q0, [r0] 1454; CHECK-NEXT: vldrw.u32 q1, [r1] 1455; CHECK-NEXT: vmlav.u32 r0, q1, q0 1456; CHECK-NEXT: bx lr 1457entry: 1458 %0 = bitcast i32* %x to <4 x i32>* 1459 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1460 %2 = bitcast i32* %y to <4 x i32>* 1461 %3 = load <4 x i32>, <4 x i32>* %2, align 4 1462 %4 = mul nsw <4 x i32> 
%3, %1 1463 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) 1464 ret i32 %5 1465} 1466 1467define i32 @mlav8i32i32(i32* %x, i32* %y) { 1468; CHECK-LABEL: mlav8i32i32: 1469; CHECK: @ %bb.0: @ %entry 1470; CHECK-NEXT: vldrw.u32 q0, [r0] 1471; CHECK-NEXT: vldrw.u32 q1, [r1] 1472; CHECK-NEXT: vmlav.u32 r2, q1, q0 1473; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 1474; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1475; CHECK-NEXT: vmlava.u32 r2, q1, q0 1476; CHECK-NEXT: mov r0, r2 1477; CHECK-NEXT: bx lr 1478entry: 1479 %0 = bitcast i32* %x to <8 x i32>* 1480 %1 = load <8 x i32>, <8 x i32>* %0, align 4 1481 %2 = bitcast i32* %y to <8 x i32>* 1482 %3 = load <8 x i32>, <8 x i32>* %2, align 4 1483 %4 = mul nsw <8 x i32> %3, %1 1484 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) 1485 ret i32 %5 1486} 1487 1488define i32 @mlav16i32i32(i32* %x, i32* %y) { 1489; CHECK-LABEL: mlav16i32i32: 1490; CHECK: @ %bb.0: @ %entry 1491; CHECK-NEXT: vldrw.u32 q0, [r0] 1492; CHECK-NEXT: vldrw.u32 q1, [r1] 1493; CHECK-NEXT: vmlav.u32 r2, q1, q0 1494; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 1495; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1496; CHECK-NEXT: vmlava.u32 r2, q1, q0 1497; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1498; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1499; CHECK-NEXT: vmlava.u32 r2, q1, q0 1500; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 1501; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1502; CHECK-NEXT: vmlava.u32 r2, q1, q0 1503; CHECK-NEXT: mov r0, r2 1504; CHECK-NEXT: bx lr 1505entry: 1506 %0 = bitcast i32* %x to <16 x i32>* 1507 %1 = load <16 x i32>, <16 x i32>* %0, align 4 1508 %2 = bitcast i32* %y to <16 x i32>* 1509 %3 = load <16 x i32>, <16 x i32>* %2, align 4 1510 %4 = mul nsw <16 x i32> %3, %1 1511 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) 1512 ret i32 %5 1513} 1514 1515define i32 @mlav24i32i32(i32* %x, i32* %y) { 1516; CHECK-LABEL: mlav24i32i32: 1517; CHECK: @ %bb.0: @ %entry 1518; CHECK-NEXT: vldrw.u32 q0, [r0] 1519; CHECK-NEXT: vldrw.u32 q1, [r1] 1520; CHECK-NEXT: mov r2, r0 1521; CHECK-NEXT: vmlav.u32 r0, q1, q0 1522; CHECK-NEXT: vldrw.u32 q0, [r2, #16] 1523; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1524; CHECK-NEXT: vmlava.u32 r0, q1, q0 1525; CHECK-NEXT: vldrw.u32 q0, [r2, #32] 1526; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1527; CHECK-NEXT: vmlava.u32 r0, q1, q0 1528; CHECK-NEXT: vldrw.u32 q0, [r2, #48] 1529; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1530; CHECK-NEXT: vmlava.u32 r0, q1, q0 1531; CHECK-NEXT: vldrw.u32 q0, [r2, #64] 1532; CHECK-NEXT: vldrw.u32 q1, [r1, #64] 1533; CHECK-NEXT: vmlava.u32 r0, q1, q0 1534; CHECK-NEXT: vldrw.u32 q0, [r2, #80] 1535; CHECK-NEXT: vldrw.u32 q1, [r1, #80] 1536; CHECK-NEXT: vmlava.u32 r0, q1, q0 1537; CHECK-NEXT: bx lr 1538entry: 1539 %0 = bitcast i32* %x to <8 x i32>* 1540 %1 = load <8 x i32>, <8 x i32>* %0, align 4 1541 %2 = bitcast i32* %y to <8 x i32>* 1542 %3 = load <8 x i32>, <8 x i32>* %2, align 4 1543 %4 = mul nsw <8 x i32> %3, %1 1544 %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8 1545 %arrayidx1.8 = getelementptr inbounds i32, i32* %y, i32 8 1546 %5 = bitcast i32* %arrayidx.8 to <16 x i32>* 1547 %6 = load <16 x i32>, <16 x i32>* %5, align 4 1548 %7 = bitcast i32* %arrayidx1.8 to <16 x i32>* 1549 %8 = load <16 x i32>, <16 x i32>* %7, align 4 1550 %9 = mul nsw <16 x i32> %8, %6 1551 %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9) 1552 %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) 1553 %op.rdx = add nsw i32 %10, %11 1554 ret i32 %op.rdx 1555} 1556 1557define i32 @mlav32i32i32(i32* %x, i32* %y) { 1558; CHECK-LABEL: mlav32i32i32: 
1559; CHECK: @ %bb.0: @ %entry 1560; CHECK-NEXT: vldrw.u32 q0, [r0] 1561; CHECK-NEXT: vldrw.u32 q1, [r1] 1562; CHECK-NEXT: mov r2, r0 1563; CHECK-NEXT: vmlav.u32 r0, q1, q0 1564; CHECK-NEXT: vldrw.u32 q0, [r2, #16] 1565; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1566; CHECK-NEXT: vmlava.u32 r0, q1, q0 1567; CHECK-NEXT: vldrw.u32 q0, [r2, #32] 1568; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1569; CHECK-NEXT: vmlava.u32 r0, q1, q0 1570; CHECK-NEXT: vldrw.u32 q0, [r2, #48] 1571; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1572; CHECK-NEXT: vmlava.u32 r0, q1, q0 1573; CHECK-NEXT: vldrw.u32 q0, [r2, #64] 1574; CHECK-NEXT: vldrw.u32 q1, [r1, #64] 1575; CHECK-NEXT: vmlava.u32 r0, q1, q0 1576; CHECK-NEXT: vldrw.u32 q0, [r2, #80] 1577; CHECK-NEXT: vldrw.u32 q1, [r1, #80] 1578; CHECK-NEXT: vmlava.u32 r0, q1, q0 1579; CHECK-NEXT: vldrw.u32 q0, [r2, #96] 1580; CHECK-NEXT: vldrw.u32 q1, [r1, #96] 1581; CHECK-NEXT: vmlava.u32 r0, q1, q0 1582; CHECK-NEXT: vldrw.u32 q0, [r2, #112] 1583; CHECK-NEXT: vldrw.u32 q1, [r1, #112] 1584; CHECK-NEXT: vmlava.u32 r0, q1, q0 1585; CHECK-NEXT: bx lr 1586entry: 1587 %0 = bitcast i32* %x to <32 x i32>* 1588 %1 = load <32 x i32>, <32 x i32>* %0, align 4 1589 %2 = bitcast i32* %y to <32 x i32>* 1590 %3 = load <32 x i32>, <32 x i32>* %2, align 4 1591 %4 = mul nsw <32 x i32> %3, %1 1592 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) 1593 ret i32 %5 1594} 1595 1596define i32 @mlav64i32i32(i32* %x, i32* %y) { 1597; CHECK-LABEL: mlav64i32i32: 1598; CHECK: @ %bb.0: @ %entry 1599; CHECK-NEXT: vldrw.u32 q0, [r0] 1600; CHECK-NEXT: vldrw.u32 q1, [r1] 1601; CHECK-NEXT: mov r2, r0 1602; CHECK-NEXT: vmlav.u32 r0, q1, q0 1603; CHECK-NEXT: vldrw.u32 q0, [r2, #16] 1604; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1605; CHECK-NEXT: vmlava.u32 r0, q1, q0 1606; CHECK-NEXT: vldrw.u32 q0, [r2, #32] 1607; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1608; CHECK-NEXT: vmlava.u32 r0, q1, q0 1609; CHECK-NEXT: vldrw.u32 q0, [r2, #48] 1610; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1611; CHECK-NEXT: vmlava.u32 r0, q1, q0 1612; CHECK-NEXT: vldrw.u32 q0, [r2, #64] 1613; CHECK-NEXT: vldrw.u32 q1, [r1, #64] 1614; CHECK-NEXT: vmlava.u32 r0, q1, q0 1615; CHECK-NEXT: vldrw.u32 q0, [r2, #80] 1616; CHECK-NEXT: vldrw.u32 q1, [r1, #80] 1617; CHECK-NEXT: vmlava.u32 r0, q1, q0 1618; CHECK-NEXT: vldrw.u32 q0, [r2, #96] 1619; CHECK-NEXT: vldrw.u32 q1, [r1, #96] 1620; CHECK-NEXT: vmlava.u32 r0, q1, q0 1621; CHECK-NEXT: vldrw.u32 q0, [r2, #112] 1622; CHECK-NEXT: vldrw.u32 q1, [r1, #112] 1623; CHECK-NEXT: vmlava.u32 r0, q1, q0 1624; CHECK-NEXT: vldrw.u32 q0, [r2, #128] 1625; CHECK-NEXT: vldrw.u32 q1, [r1, #128] 1626; CHECK-NEXT: vmlava.u32 r0, q1, q0 1627; CHECK-NEXT: vldrw.u32 q0, [r2, #144] 1628; CHECK-NEXT: vldrw.u32 q1, [r1, #144] 1629; CHECK-NEXT: vmlava.u32 r0, q1, q0 1630; CHECK-NEXT: vldrw.u32 q0, [r2, #160] 1631; CHECK-NEXT: vldrw.u32 q1, [r1, #160] 1632; CHECK-NEXT: vmlava.u32 r0, q1, q0 1633; CHECK-NEXT: vldrw.u32 q0, [r2, #176] 1634; CHECK-NEXT: vldrw.u32 q1, [r1, #176] 1635; CHECK-NEXT: vmlava.u32 r0, q1, q0 1636; CHECK-NEXT: vldrw.u32 q0, [r2, #192] 1637; CHECK-NEXT: vldrw.u32 q1, [r1, #192] 1638; CHECK-NEXT: vmlava.u32 r0, q1, q0 1639; CHECK-NEXT: vldrw.u32 q0, [r2, #208] 1640; CHECK-NEXT: vldrw.u32 q1, [r1, #208] 1641; CHECK-NEXT: vmlava.u32 r0, q1, q0 1642; CHECK-NEXT: vldrw.u32 q0, [r2, #224] 1643; CHECK-NEXT: vldrw.u32 q1, [r1, #224] 1644; CHECK-NEXT: vmlava.u32 r0, q1, q0 1645; CHECK-NEXT: vldrw.u32 q0, [r2, #240] 1646; CHECK-NEXT: vldrw.u32 q1, [r1, #240] 1647; CHECK-NEXT: vmlava.u32 r0, q1, q0 1648; CHECK-NEXT: bx lr 
1649entry: 1650 %0 = bitcast i32* %x to <4 x i32>* 1651 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4 1652 %1 = bitcast i32* %y to <4 x i32>* 1653 %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4 1654 %2 = mul nsw <4 x i32> %wide.load10, %wide.load 1655 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) 1656 %4 = getelementptr inbounds i32, i32* %x, i32 4 1657 %5 = bitcast i32* %4 to <4 x i32>* 1658 %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4 1659 %6 = getelementptr inbounds i32, i32* %y, i32 4 1660 %7 = bitcast i32* %6 to <4 x i32>* 1661 %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4 1662 %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1 1663 %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8) 1664 %10 = add i32 %9, %3 1665 %11 = getelementptr inbounds i32, i32* %x, i32 8 1666 %12 = bitcast i32* %11 to <4 x i32>* 1667 %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4 1668 %13 = getelementptr inbounds i32, i32* %y, i32 8 1669 %14 = bitcast i32* %13 to <4 x i32>* 1670 %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4 1671 %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2 1672 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15) 1673 %17 = add i32 %16, %10 1674 %18 = getelementptr inbounds i32, i32* %x, i32 12 1675 %19 = bitcast i32* %18 to <4 x i32>* 1676 %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4 1677 %20 = getelementptr inbounds i32, i32* %y, i32 12 1678 %21 = bitcast i32* %20 to <4 x i32>* 1679 %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4 1680 %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3 1681 %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22) 1682 %24 = add i32 %23, %17 1683 %25 = getelementptr inbounds i32, i32* %x, i32 16 1684 %26 = bitcast i32* %25 to <4 x i32>* 1685 %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4 1686 %27 = getelementptr inbounds i32, i32* %y, i32 16 1687 %28 = bitcast i32* %27 to <4 x i32>* 1688 %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4 1689 %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4 1690 %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29) 1691 %31 = add i32 %30, %24 1692 %32 = getelementptr inbounds i32, i32* %x, i32 20 1693 %33 = bitcast i32* %32 to <4 x i32>* 1694 %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4 1695 %34 = getelementptr inbounds i32, i32* %y, i32 20 1696 %35 = bitcast i32* %34 to <4 x i32>* 1697 %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4 1698 %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5 1699 %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36) 1700 %38 = add i32 %37, %31 1701 %39 = getelementptr inbounds i32, i32* %x, i32 24 1702 %40 = bitcast i32* %39 to <4 x i32>* 1703 %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4 1704 %41 = getelementptr inbounds i32, i32* %y, i32 24 1705 %42 = bitcast i32* %41 to <4 x i32>* 1706 %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4 1707 %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6 1708 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43) 1709 %45 = add i32 %44, %38 1710 %46 = getelementptr inbounds i32, i32* %x, i32 28 1711 %47 = bitcast i32* %46 to <4 x i32>* 1712 %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4 1713 %48 = getelementptr inbounds i32, i32* %y, i32 28 1714 %49 = bitcast i32* %48 to <4 x i32>* 1715 %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4 1716 %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7 1717 %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50) 1718 
%52 = add i32 %51, %45 1719 %53 = getelementptr inbounds i32, i32* %x, i32 32 1720 %54 = bitcast i32* %53 to <4 x i32>* 1721 %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4 1722 %55 = getelementptr inbounds i32, i32* %y, i32 32 1723 %56 = bitcast i32* %55 to <4 x i32>* 1724 %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4 1725 %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8 1726 %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57) 1727 %59 = add i32 %58, %52 1728 %60 = getelementptr inbounds i32, i32* %x, i32 36 1729 %61 = bitcast i32* %60 to <4 x i32>* 1730 %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4 1731 %62 = getelementptr inbounds i32, i32* %y, i32 36 1732 %63 = bitcast i32* %62 to <4 x i32>* 1733 %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4 1734 %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9 1735 %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64) 1736 %66 = add i32 %65, %59 1737 %67 = getelementptr inbounds i32, i32* %x, i32 40 1738 %68 = bitcast i32* %67 to <4 x i32>* 1739 %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4 1740 %69 = getelementptr inbounds i32, i32* %y, i32 40 1741 %70 = bitcast i32* %69 to <4 x i32>* 1742 %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4 1743 %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10 1744 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71) 1745 %73 = add i32 %72, %66 1746 %74 = getelementptr inbounds i32, i32* %x, i32 44 1747 %75 = bitcast i32* %74 to <4 x i32>* 1748 %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4 1749 %76 = getelementptr inbounds i32, i32* %y, i32 44 1750 %77 = bitcast i32* %76 to <4 x i32>* 1751 %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4 1752 %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11 1753 %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78) 1754 %80 = add i32 %79, %73 1755 %81 = getelementptr inbounds i32, i32* %x, i32 48 1756 %82 = bitcast i32* %81 to <4 x i32>* 1757 %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4 1758 %83 = getelementptr inbounds i32, i32* %y, i32 48 1759 %84 = bitcast i32* %83 to <4 x i32>* 1760 %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4 1761 %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12 1762 %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85) 1763 %87 = add i32 %86, %80 1764 %88 = getelementptr inbounds i32, i32* %x, i32 52 1765 %89 = bitcast i32* %88 to <4 x i32>* 1766 %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4 1767 %90 = getelementptr inbounds i32, i32* %y, i32 52 1768 %91 = bitcast i32* %90 to <4 x i32>* 1769 %wide.load10.13 = load <4 x i32>, <4 x i32>* %91, align 4 1770 %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13 1771 %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92) 1772 %94 = add i32 %93, %87 1773 %95 = getelementptr inbounds i32, i32* %x, i32 56 1774 %96 = bitcast i32* %95 to <4 x i32>* 1775 %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4 1776 %97 = getelementptr inbounds i32, i32* %y, i32 56 1777 %98 = bitcast i32* %97 to <4 x i32>* 1778 %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4 1779 %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14 1780 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99) 1781 %101 = add i32 %100, %94 1782 %102 = getelementptr inbounds i32, i32* %x, i32 60 1783 %103 = bitcast i32* %102 to <4 x i32>* 1784 %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4 1785 %104 = getelementptr inbounds i32, i32* %y, i32 60 1786 %105 = bitcast 
i32* %104 to <4 x i32>* 1787 %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4 1788 %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15 1789 %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106) 1790 %108 = add i32 %107, %101 1791 ret i32 %108 1792} 1793 1794define i32 @mlav128i32i32(i32* %x, i32* %y) { 1795; CHECK-LABEL: mlav128i32i32: 1796; CHECK: @ %bb.0: @ %entry 1797; CHECK-NEXT: vldrw.u32 q0, [r0] 1798; CHECK-NEXT: vldrw.u32 q1, [r1] 1799; CHECK-NEXT: mov r2, r0 1800; CHECK-NEXT: vmlav.u32 r0, q1, q0 1801; CHECK-NEXT: vldrw.u32 q0, [r2, #16] 1802; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1803; CHECK-NEXT: vmlava.u32 r0, q1, q0 1804; CHECK-NEXT: vldrw.u32 q0, [r2, #32] 1805; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1806; CHECK-NEXT: vmlava.u32 r0, q1, q0 1807; CHECK-NEXT: vldrw.u32 q0, [r2, #48] 1808; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1809; CHECK-NEXT: vmlava.u32 r0, q1, q0 1810; CHECK-NEXT: vldrw.u32 q0, [r2, #64] 1811; CHECK-NEXT: vldrw.u32 q1, [r1, #64] 1812; CHECK-NEXT: vmlava.u32 r0, q1, q0 1813; CHECK-NEXT: vldrw.u32 q0, [r2, #80] 1814; CHECK-NEXT: vldrw.u32 q1, [r1, #80] 1815; CHECK-NEXT: vmlava.u32 r0, q1, q0 1816; CHECK-NEXT: vldrw.u32 q0, [r2, #96] 1817; CHECK-NEXT: vldrw.u32 q1, [r1, #96] 1818; CHECK-NEXT: vmlava.u32 r0, q1, q0 1819; CHECK-NEXT: vldrw.u32 q0, [r2, #112] 1820; CHECK-NEXT: vldrw.u32 q1, [r1, #112] 1821; CHECK-NEXT: vmlava.u32 r0, q1, q0 1822; CHECK-NEXT: vldrw.u32 q0, [r2, #128] 1823; CHECK-NEXT: vldrw.u32 q1, [r1, #128] 1824; CHECK-NEXT: vmlava.u32 r0, q1, q0 1825; CHECK-NEXT: vldrw.u32 q0, [r2, #144] 1826; CHECK-NEXT: vldrw.u32 q1, [r1, #144] 1827; CHECK-NEXT: vmlava.u32 r0, q1, q0 1828; CHECK-NEXT: vldrw.u32 q0, [r2, #160] 1829; CHECK-NEXT: vldrw.u32 q1, [r1, #160] 1830; CHECK-NEXT: vmlava.u32 r0, q1, q0 1831; CHECK-NEXT: vldrw.u32 q0, [r2, #176] 1832; CHECK-NEXT: vldrw.u32 q1, [r1, #176] 1833; CHECK-NEXT: vmlava.u32 r0, q1, q0 1834; CHECK-NEXT: vldrw.u32 q0, [r2, #192] 1835; CHECK-NEXT: vldrw.u32 q1, [r1, #192] 1836; CHECK-NEXT: vmlava.u32 r0, q1, q0 1837; CHECK-NEXT: vldrw.u32 q0, [r2, #208] 1838; CHECK-NEXT: vldrw.u32 q1, [r1, #208] 1839; CHECK-NEXT: vmlava.u32 r0, q1, q0 1840; CHECK-NEXT: vldrw.u32 q0, [r2, #224] 1841; CHECK-NEXT: vldrw.u32 q1, [r1, #224] 1842; CHECK-NEXT: vmlava.u32 r0, q1, q0 1843; CHECK-NEXT: vldrw.u32 q0, [r2, #240] 1844; CHECK-NEXT: vldrw.u32 q1, [r1, #240] 1845; CHECK-NEXT: vmlava.u32 r0, q1, q0 1846; CHECK-NEXT: vldrw.u32 q0, [r2, #256] 1847; CHECK-NEXT: vldrw.u32 q1, [r1, #256] 1848; CHECK-NEXT: vmlava.u32 r0, q1, q0 1849; CHECK-NEXT: vldrw.u32 q0, [r2, #272] 1850; CHECK-NEXT: vldrw.u32 q1, [r1, #272] 1851; CHECK-NEXT: vmlava.u32 r0, q1, q0 1852; CHECK-NEXT: vldrw.u32 q0, [r2, #288] 1853; CHECK-NEXT: vldrw.u32 q1, [r1, #288] 1854; CHECK-NEXT: vmlava.u32 r0, q1, q0 1855; CHECK-NEXT: vldrw.u32 q0, [r2, #304] 1856; CHECK-NEXT: vldrw.u32 q1, [r1, #304] 1857; CHECK-NEXT: vmlava.u32 r0, q1, q0 1858; CHECK-NEXT: vldrw.u32 q0, [r2, #320] 1859; CHECK-NEXT: vldrw.u32 q1, [r1, #320] 1860; CHECK-NEXT: vmlava.u32 r0, q1, q0 1861; CHECK-NEXT: vldrw.u32 q0, [r2, #336] 1862; CHECK-NEXT: vldrw.u32 q1, [r1, #336] 1863; CHECK-NEXT: vmlava.u32 r0, q1, q0 1864; CHECK-NEXT: vldrw.u32 q0, [r2, #352] 1865; CHECK-NEXT: vldrw.u32 q1, [r1, #352] 1866; CHECK-NEXT: vmlava.u32 r0, q1, q0 1867; CHECK-NEXT: vldrw.u32 q0, [r2, #368] 1868; CHECK-NEXT: vldrw.u32 q1, [r1, #368] 1869; CHECK-NEXT: vmlava.u32 r0, q1, q0 1870; CHECK-NEXT: vldrw.u32 q0, [r2, #384] 1871; CHECK-NEXT: vldrw.u32 q1, [r1, #384] 1872; CHECK-NEXT: vmlava.u32 r0, q1, q0 1873; 
CHECK-NEXT: vldrw.u32 q0, [r2, #400] 1874; CHECK-NEXT: vldrw.u32 q1, [r1, #400] 1875; CHECK-NEXT: vmlava.u32 r0, q1, q0 1876; CHECK-NEXT: vldrw.u32 q0, [r2, #416] 1877; CHECK-NEXT: vldrw.u32 q1, [r1, #416] 1878; CHECK-NEXT: vmlava.u32 r0, q1, q0 1879; CHECK-NEXT: vldrw.u32 q0, [r2, #432] 1880; CHECK-NEXT: vldrw.u32 q1, [r1, #432] 1881; CHECK-NEXT: vmlava.u32 r0, q1, q0 1882; CHECK-NEXT: vldrw.u32 q0, [r2, #448] 1883; CHECK-NEXT: vldrw.u32 q1, [r1, #448] 1884; CHECK-NEXT: vmlava.u32 r0, q1, q0 1885; CHECK-NEXT: vldrw.u32 q0, [r2, #464] 1886; CHECK-NEXT: vldrw.u32 q1, [r1, #464] 1887; CHECK-NEXT: vmlava.u32 r0, q1, q0 1888; CHECK-NEXT: vldrw.u32 q0, [r2, #480] 1889; CHECK-NEXT: vldrw.u32 q1, [r1, #480] 1890; CHECK-NEXT: vmlava.u32 r0, q1, q0 1891; CHECK-NEXT: vldrw.u32 q0, [r2, #496] 1892; CHECK-NEXT: vldrw.u32 q1, [r1, #496] 1893; CHECK-NEXT: vmlava.u32 r0, q1, q0 1894; CHECK-NEXT: bx lr 1895entry: 1896 %0 = bitcast i32* %x to <4 x i32>* 1897 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4 1898 %1 = bitcast i32* %y to <4 x i32>* 1899 %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4 1900 %2 = mul nsw <4 x i32> %wide.load10, %wide.load 1901 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) 1902 %4 = getelementptr inbounds i32, i32* %x, i32 4 1903 %5 = bitcast i32* %4 to <4 x i32>* 1904 %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4 1905 %6 = getelementptr inbounds i32, i32* %y, i32 4 1906 %7 = bitcast i32* %6 to <4 x i32>* 1907 %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4 1908 %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1 1909 %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8) 1910 %10 = add i32 %9, %3 1911 %11 = getelementptr inbounds i32, i32* %x, i32 8 1912 %12 = bitcast i32* %11 to <4 x i32>* 1913 %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4 1914 %13 = getelementptr inbounds i32, i32* %y, i32 8 1915 %14 = bitcast i32* %13 to <4 x i32>* 1916 %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4 1917 %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2 1918 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15) 1919 %17 = add i32 %16, %10 1920 %18 = getelementptr inbounds i32, i32* %x, i32 12 1921 %19 = bitcast i32* %18 to <4 x i32>* 1922 %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4 1923 %20 = getelementptr inbounds i32, i32* %y, i32 12 1924 %21 = bitcast i32* %20 to <4 x i32>* 1925 %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4 1926 %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3 1927 %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22) 1928 %24 = add i32 %23, %17 1929 %25 = getelementptr inbounds i32, i32* %x, i32 16 1930 %26 = bitcast i32* %25 to <4 x i32>* 1931 %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4 1932 %27 = getelementptr inbounds i32, i32* %y, i32 16 1933 %28 = bitcast i32* %27 to <4 x i32>* 1934 %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4 1935 %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4 1936 %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29) 1937 %31 = add i32 %30, %24 1938 %32 = getelementptr inbounds i32, i32* %x, i32 20 1939 %33 = bitcast i32* %32 to <4 x i32>* 1940 %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4 1941 %34 = getelementptr inbounds i32, i32* %y, i32 20 1942 %35 = bitcast i32* %34 to <4 x i32>* 1943 %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4 1944 %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5 1945 %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36) 1946 %38 = add i32 %37, %31 1947 %39 = 
getelementptr inbounds i32, i32* %x, i32 24 1948 %40 = bitcast i32* %39 to <4 x i32>* 1949 %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4 1950 %41 = getelementptr inbounds i32, i32* %y, i32 24 1951 %42 = bitcast i32* %41 to <4 x i32>* 1952 %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4 1953 %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6 1954 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43) 1955 %45 = add i32 %44, %38 1956 %46 = getelementptr inbounds i32, i32* %x, i32 28 1957 %47 = bitcast i32* %46 to <4 x i32>* 1958 %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4 1959 %48 = getelementptr inbounds i32, i32* %y, i32 28 1960 %49 = bitcast i32* %48 to <4 x i32>* 1961 %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4 1962 %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7 1963 %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50) 1964 %52 = add i32 %51, %45 1965 %53 = getelementptr inbounds i32, i32* %x, i32 32 1966 %54 = bitcast i32* %53 to <4 x i32>* 1967 %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4 1968 %55 = getelementptr inbounds i32, i32* %y, i32 32 1969 %56 = bitcast i32* %55 to <4 x i32>* 1970 %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4 1971 %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8 1972 %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57) 1973 %59 = add i32 %58, %52 1974 %60 = getelementptr inbounds i32, i32* %x, i32 36 1975 %61 = bitcast i32* %60 to <4 x i32>* 1976 %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4 1977 %62 = getelementptr inbounds i32, i32* %y, i32 36 1978 %63 = bitcast i32* %62 to <4 x i32>* 1979 %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4 1980 %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9 1981 %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64) 1982 %66 = add i32 %65, %59 1983 %67 = getelementptr inbounds i32, i32* %x, i32 40 1984 %68 = bitcast i32* %67 to <4 x i32>* 1985 %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4 1986 %69 = getelementptr inbounds i32, i32* %y, i32 40 1987 %70 = bitcast i32* %69 to <4 x i32>* 1988 %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4 1989 %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10 1990 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71) 1991 %73 = add i32 %72, %66 1992 %74 = getelementptr inbounds i32, i32* %x, i32 44 1993 %75 = bitcast i32* %74 to <4 x i32>* 1994 %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4 1995 %76 = getelementptr inbounds i32, i32* %y, i32 44 1996 %77 = bitcast i32* %76 to <4 x i32>* 1997 %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4 1998 %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11 1999 %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78) 2000 %80 = add i32 %79, %73 2001 %81 = getelementptr inbounds i32, i32* %x, i32 48 2002 %82 = bitcast i32* %81 to <4 x i32>* 2003 %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4 2004 %83 = getelementptr inbounds i32, i32* %y, i32 48 2005 %84 = bitcast i32* %83 to <4 x i32>* 2006 %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4 2007 %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12 2008 %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85) 2009 %87 = add i32 %86, %80 2010 %88 = getelementptr inbounds i32, i32* %x, i32 52 2011 %89 = bitcast i32* %88 to <4 x i32>* 2012 %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4 2013 %90 = getelementptr inbounds i32, i32* %y, i32 52 2014 %91 = bitcast i32* %90 to <4 x i32>* 2015 %wide.load10.13 = load 
<4 x i32>, <4 x i32>* %91, align 4 2016 %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13 2017 %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92) 2018 %94 = add i32 %93, %87 2019 %95 = getelementptr inbounds i32, i32* %x, i32 56 2020 %96 = bitcast i32* %95 to <4 x i32>* 2021 %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4 2022 %97 = getelementptr inbounds i32, i32* %y, i32 56 2023 %98 = bitcast i32* %97 to <4 x i32>* 2024 %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4 2025 %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14 2026 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99) 2027 %101 = add i32 %100, %94 2028 %102 = getelementptr inbounds i32, i32* %x, i32 60 2029 %103 = bitcast i32* %102 to <4 x i32>* 2030 %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4 2031 %104 = getelementptr inbounds i32, i32* %y, i32 60 2032 %105 = bitcast i32* %104 to <4 x i32>* 2033 %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4 2034 %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15 2035 %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106) 2036 %108 = add i32 %107, %101 2037 %109 = getelementptr inbounds i32, i32* %x, i32 64 2038 %110 = bitcast i32* %109 to <4 x i32>* 2039 %wide.load.16 = load <4 x i32>, <4 x i32>* %110, align 4 2040 %111 = getelementptr inbounds i32, i32* %y, i32 64 2041 %112 = bitcast i32* %111 to <4 x i32>* 2042 %wide.load10.16 = load <4 x i32>, <4 x i32>* %112, align 4 2043 %113 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16 2044 %114 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %113) 2045 %115 = add i32 %114, %108 2046 %116 = getelementptr inbounds i32, i32* %x, i32 68 2047 %117 = bitcast i32* %116 to <4 x i32>* 2048 %wide.load.17 = load <4 x i32>, <4 x i32>* %117, align 4 2049 %118 = getelementptr inbounds i32, i32* %y, i32 68 2050 %119 = bitcast i32* %118 to <4 x i32>* 2051 %wide.load10.17 = load <4 x i32>, <4 x i32>* %119, align 4 2052 %120 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17 2053 %121 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %120) 2054 %122 = add i32 %121, %115 2055 %123 = getelementptr inbounds i32, i32* %x, i32 72 2056 %124 = bitcast i32* %123 to <4 x i32>* 2057 %wide.load.18 = load <4 x i32>, <4 x i32>* %124, align 4 2058 %125 = getelementptr inbounds i32, i32* %y, i32 72 2059 %126 = bitcast i32* %125 to <4 x i32>* 2060 %wide.load10.18 = load <4 x i32>, <4 x i32>* %126, align 4 2061 %127 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18 2062 %128 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %127) 2063 %129 = add i32 %128, %122 2064 %130 = getelementptr inbounds i32, i32* %x, i32 76 2065 %131 = bitcast i32* %130 to <4 x i32>* 2066 %wide.load.19 = load <4 x i32>, <4 x i32>* %131, align 4 2067 %132 = getelementptr inbounds i32, i32* %y, i32 76 2068 %133 = bitcast i32* %132 to <4 x i32>* 2069 %wide.load10.19 = load <4 x i32>, <4 x i32>* %133, align 4 2070 %134 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19 2071 %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134) 2072 %136 = add i32 %135, %129 2073 %137 = getelementptr inbounds i32, i32* %x, i32 80 2074 %138 = bitcast i32* %137 to <4 x i32>* 2075 %wide.load.20 = load <4 x i32>, <4 x i32>* %138, align 4 2076 %139 = getelementptr inbounds i32, i32* %y, i32 80 2077 %140 = bitcast i32* %139 to <4 x i32>* 2078 %wide.load10.20 = load <4 x i32>, <4 x i32>* %140, align 4 2079 %141 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20 2080 %142 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %141) 2081 %143 = 
add i32 %142, %136 2082 %144 = getelementptr inbounds i32, i32* %x, i32 84 2083 %145 = bitcast i32* %144 to <4 x i32>* 2084 %wide.load.21 = load <4 x i32>, <4 x i32>* %145, align 4 2085 %146 = getelementptr inbounds i32, i32* %y, i32 84 2086 %147 = bitcast i32* %146 to <4 x i32>* 2087 %wide.load10.21 = load <4 x i32>, <4 x i32>* %147, align 4 2088 %148 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21 2089 %149 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %148) 2090 %150 = add i32 %149, %143 2091 %151 = getelementptr inbounds i32, i32* %x, i32 88 2092 %152 = bitcast i32* %151 to <4 x i32>* 2093 %wide.load.22 = load <4 x i32>, <4 x i32>* %152, align 4 2094 %153 = getelementptr inbounds i32, i32* %y, i32 88 2095 %154 = bitcast i32* %153 to <4 x i32>* 2096 %wide.load10.22 = load <4 x i32>, <4 x i32>* %154, align 4 2097 %155 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22 2098 %156 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %155) 2099 %157 = add i32 %156, %150 2100 %158 = getelementptr inbounds i32, i32* %x, i32 92 2101 %159 = bitcast i32* %158 to <4 x i32>* 2102 %wide.load.23 = load <4 x i32>, <4 x i32>* %159, align 4 2103 %160 = getelementptr inbounds i32, i32* %y, i32 92 2104 %161 = bitcast i32* %160 to <4 x i32>* 2105 %wide.load10.23 = load <4 x i32>, <4 x i32>* %161, align 4 2106 %162 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23 2107 %163 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %162) 2108 %164 = add i32 %163, %157 2109 %165 = getelementptr inbounds i32, i32* %x, i32 96 2110 %166 = bitcast i32* %165 to <4 x i32>* 2111 %wide.load.24 = load <4 x i32>, <4 x i32>* %166, align 4 2112 %167 = getelementptr inbounds i32, i32* %y, i32 96 2113 %168 = bitcast i32* %167 to <4 x i32>* 2114 %wide.load10.24 = load <4 x i32>, <4 x i32>* %168, align 4 2115 %169 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24 2116 %170 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %169) 2117 %171 = add i32 %170, %164 2118 %172 = getelementptr inbounds i32, i32* %x, i32 100 2119 %173 = bitcast i32* %172 to <4 x i32>* 2120 %wide.load.25 = load <4 x i32>, <4 x i32>* %173, align 4 2121 %174 = getelementptr inbounds i32, i32* %y, i32 100 2122 %175 = bitcast i32* %174 to <4 x i32>* 2123 %wide.load10.25 = load <4 x i32>, <4 x i32>* %175, align 4 2124 %176 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25 2125 %177 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %176) 2126 %178 = add i32 %177, %171 2127 %179 = getelementptr inbounds i32, i32* %x, i32 104 2128 %180 = bitcast i32* %179 to <4 x i32>* 2129 %wide.load.26 = load <4 x i32>, <4 x i32>* %180, align 4 2130 %181 = getelementptr inbounds i32, i32* %y, i32 104 2131 %182 = bitcast i32* %181 to <4 x i32>* 2132 %wide.load10.26 = load <4 x i32>, <4 x i32>* %182, align 4 2133 %183 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26 2134 %184 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %183) 2135 %185 = add i32 %184, %178 2136 %186 = getelementptr inbounds i32, i32* %x, i32 108 2137 %187 = bitcast i32* %186 to <4 x i32>* 2138 %wide.load.27 = load <4 x i32>, <4 x i32>* %187, align 4 2139 %188 = getelementptr inbounds i32, i32* %y, i32 108 2140 %189 = bitcast i32* %188 to <4 x i32>* 2141 %wide.load10.27 = load <4 x i32>, <4 x i32>* %189, align 4 2142 %190 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27 2143 %191 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %190) 2144 %192 = add i32 %191, %185 2145 %193 = getelementptr inbounds i32, i32* %x, i32 112 2146 %194 = bitcast i32* %193 to <4 x i32>* 2147 %wide.load.28 = load <4 x 
i32>, <4 x i32>* %194, align 4 2148 %195 = getelementptr inbounds i32, i32* %y, i32 112 2149 %196 = bitcast i32* %195 to <4 x i32>* 2150 %wide.load10.28 = load <4 x i32>, <4 x i32>* %196, align 4 2151 %197 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28 2152 %198 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %197) 2153 %199 = add i32 %198, %192 2154 %200 = getelementptr inbounds i32, i32* %x, i32 116 2155 %201 = bitcast i32* %200 to <4 x i32>* 2156 %wide.load.29 = load <4 x i32>, <4 x i32>* %201, align 4 2157 %202 = getelementptr inbounds i32, i32* %y, i32 116 2158 %203 = bitcast i32* %202 to <4 x i32>* 2159 %wide.load10.29 = load <4 x i32>, <4 x i32>* %203, align 4 2160 %204 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29 2161 %205 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %204) 2162 %206 = add i32 %205, %199 2163 %207 = getelementptr inbounds i32, i32* %x, i32 120 2164 %208 = bitcast i32* %207 to <4 x i32>* 2165 %wide.load.30 = load <4 x i32>, <4 x i32>* %208, align 4 2166 %209 = getelementptr inbounds i32, i32* %y, i32 120 2167 %210 = bitcast i32* %209 to <4 x i32>* 2168 %wide.load10.30 = load <4 x i32>, <4 x i32>* %210, align 4 2169 %211 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30 2170 %212 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %211) 2171 %213 = add i32 %212, %206 2172 %214 = getelementptr inbounds i32, i32* %x, i32 124 2173 %215 = bitcast i32* %214 to <4 x i32>* 2174 %wide.load.31 = load <4 x i32>, <4 x i32>* %215, align 4 2175 %216 = getelementptr inbounds i32, i32* %y, i32 124 2176 %217 = bitcast i32* %216 to <4 x i32>* 2177 %wide.load10.31 = load <4 x i32>, <4 x i32>* %217, align 4 2178 %218 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31 2179 %219 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %218) 2180 %220 = add i32 %219, %213 2181 ret i32 %220 2182} 2183 2184define i32 @mlav2i32i16(i16* %x, i16* %y) { 2185; CHECK-LABEL: mlav2i32i16: 2186; CHECK: @ %bb.0: @ %entry 2187; CHECK-NEXT: ldrsh.w r2, [r0] 2188; CHECK-NEXT: ldrsh.w r3, [r1] 2189; CHECK-NEXT: ldrsh.w r0, [r0, #2] 2190; CHECK-NEXT: ldrsh.w r1, [r1, #2] 2191; CHECK-NEXT: muls r0, r1, r0 2192; CHECK-NEXT: smlabb r0, r3, r2, r0 2193; CHECK-NEXT: bx lr 2194entry: 2195 %0 = load i16, i16* %x, align 2 2196 %conv = sext i16 %0 to i32 2197 %1 = load i16, i16* %y, align 2 2198 %conv2 = sext i16 %1 to i32 2199 %mul = mul nsw i32 %conv2, %conv 2200 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1 2201 %2 = load i16, i16* %arrayidx.1, align 2 2202 %conv.1 = sext i16 %2 to i32 2203 %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1 2204 %3 = load i16, i16* %arrayidx1.1, align 2 2205 %conv2.1 = sext i16 %3 to i32 2206 %mul.1 = mul nsw i32 %conv2.1, %conv.1 2207 %add.1 = add nsw i32 %mul.1, %mul 2208 ret i32 %add.1 2209} 2210 2211define i32 @mlav4i32i16(i16* %x, i16* %y) { 2212; CHECK-LABEL: mlav4i32i16: 2213; CHECK: @ %bb.0: @ %entry 2214; CHECK-NEXT: vldrh.s32 q0, [r0] 2215; CHECK-NEXT: vldrh.s32 q1, [r1] 2216; CHECK-NEXT: vmlav.u32 r0, q1, q0 2217; CHECK-NEXT: bx lr 2218entry: 2219 %0 = bitcast i16* %x to <4 x i16>* 2220 %1 = load <4 x i16>, <4 x i16>* %0, align 2 2221 %2 = sext <4 x i16> %1 to <4 x i32> 2222 %3 = bitcast i16* %y to <4 x i16>* 2223 %4 = load <4 x i16>, <4 x i16>* %3, align 2 2224 %5 = sext <4 x i16> %4 to <4 x i32> 2225 %6 = mul nsw <4 x i32> %5, %2 2226 %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6) 2227 ret i32 %7 2228} 2229 2230define i32 @mlav8i32i16(i16* %x, i16* %y) { 2231; CHECK-LABEL: mlav8i32i16: 2232; CHECK: @ %bb.0: @ %entry 2233; 
CHECK-NEXT: vldrh.u16 q0, [r0] 2234; CHECK-NEXT: vldrh.u16 q1, [r1] 2235; CHECK-NEXT: vmlav.s16 r0, q1, q0 2236; CHECK-NEXT: bx lr 2237entry: 2238 %0 = bitcast i16* %x to <8 x i16>* 2239 %1 = load <8 x i16>, <8 x i16>* %0, align 2 2240 %2 = sext <8 x i16> %1 to <8 x i32> 2241 %3 = bitcast i16* %y to <8 x i16>* 2242 %4 = load <8 x i16>, <8 x i16>* %3, align 2 2243 %5 = sext <8 x i16> %4 to <8 x i32> 2244 %6 = mul nsw <8 x i32> %5, %2 2245 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) 2246 ret i32 %7 2247} 2248 2249define i32 @mlav16i32i16(i16* %x, i16* %y) { 2250; CHECK-LABEL: mlav16i32i16: 2251; CHECK: @ %bb.0: @ %entry 2252; CHECK-NEXT: vldrh.s32 q0, [r0] 2253; CHECK-NEXT: vldrh.s32 q1, [r1] 2254; CHECK-NEXT: vmlav.u32 r2, q1, q0 2255; CHECK-NEXT: vldrh.s32 q0, [r0, #8] 2256; CHECK-NEXT: vldrh.s32 q1, [r1, #8] 2257; CHECK-NEXT: vmlava.u32 r2, q1, q0 2258; CHECK-NEXT: vldrh.s32 q0, [r0, #16] 2259; CHECK-NEXT: vldrh.s32 q1, [r1, #16] 2260; CHECK-NEXT: vmlava.u32 r2, q1, q0 2261; CHECK-NEXT: vldrh.s32 q0, [r0, #24] 2262; CHECK-NEXT: vldrh.s32 q1, [r1, #24] 2263; CHECK-NEXT: vmlava.u32 r2, q1, q0 2264; CHECK-NEXT: mov r0, r2 2265; CHECK-NEXT: bx lr 2266entry: 2267 %0 = bitcast i16* %x to <16 x i16>* 2268 %1 = load <16 x i16>, <16 x i16>* %0, align 2 2269 %2 = sext <16 x i16> %1 to <16 x i32> 2270 %3 = bitcast i16* %y to <16 x i16>* 2271 %4 = load <16 x i16>, <16 x i16>* %3, align 2 2272 %5 = sext <16 x i16> %4 to <16 x i32> 2273 %6 = mul nsw <16 x i32> %5, %2 2274 %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6) 2275 ret i32 %7 2276} 2277 2278define i32 @mlav24i32i16(i16* %x, i16* %y) { 2279; CHECK-LABEL: mlav24i32i16: 2280; CHECK: @ %bb.0: @ %entry 2281; CHECK-NEXT: vldrh.u16 q0, [r0] 2282; CHECK-NEXT: vldrh.u16 q1, [r1] 2283; CHECK-NEXT: mov r2, r0 2284; CHECK-NEXT: vmlav.s16 r0, q1, q0 2285; CHECK-NEXT: vldrh.s32 q0, [r2, #16] 2286; CHECK-NEXT: vldrh.s32 q1, [r1, #16] 2287; CHECK-NEXT: vmlava.u32 r0, q1, q0 2288; CHECK-NEXT: vldrh.s32 q0, [r2, #24] 2289; CHECK-NEXT: vldrh.s32 q1, [r1, #24] 2290; CHECK-NEXT: vmlava.u32 r0, q1, q0 2291; CHECK-NEXT: vldrh.s32 q0, [r2, #32] 2292; CHECK-NEXT: vldrh.s32 q1, [r1, #32] 2293; CHECK-NEXT: vmlava.u32 r0, q1, q0 2294; CHECK-NEXT: vldrh.s32 q0, [r2, #40] 2295; CHECK-NEXT: vldrh.s32 q1, [r1, #40] 2296; CHECK-NEXT: vmlava.u32 r0, q1, q0 2297; CHECK-NEXT: bx lr 2298entry: 2299 %0 = bitcast i16* %x to <8 x i16>* 2300 %1 = load <8 x i16>, <8 x i16>* %0, align 2 2301 %2 = sext <8 x i16> %1 to <8 x i32> 2302 %3 = bitcast i16* %y to <8 x i16>* 2303 %4 = load <8 x i16>, <8 x i16>* %3, align 2 2304 %5 = sext <8 x i16> %4 to <8 x i32> 2305 %6 = mul nsw <8 x i32> %5, %2 2306 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8 2307 %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8 2308 %7 = bitcast i16* %arrayidx.8 to <16 x i16>* 2309 %8 = load <16 x i16>, <16 x i16>* %7, align 2 2310 %9 = sext <16 x i16> %8 to <16 x i32> 2311 %10 = bitcast i16* %arrayidx1.8 to <16 x i16>* 2312 %11 = load <16 x i16>, <16 x i16>* %10, align 2 2313 %12 = sext <16 x i16> %11 to <16 x i32> 2314 %13 = mul nsw <16 x i32> %12, %9 2315 %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13) 2316 %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) 2317 %op.rdx = add nsw i32 %14, %15 2318 ret i32 %op.rdx 2319} 2320 2321define i32 @mlav32i32i16(i16* %x, i16* %y) { 2322; CHECK-LABEL: mlav32i32i16: 2323; CHECK: @ %bb.0: @ %entry 2324; CHECK-NEXT: vldrh.s32 q0, [r0] 2325; CHECK-NEXT: vldrh.s32 q1, [r1] 2326; CHECK-NEXT: mov r2, r0 2327; 
CHECK-NEXT: vmlav.u32 r0, q1, q0 2328; CHECK-NEXT: vldrh.s32 q0, [r2, #8] 2329; CHECK-NEXT: vldrh.s32 q1, [r1, #8] 2330; CHECK-NEXT: vmlava.u32 r0, q1, q0 2331; CHECK-NEXT: vldrh.s32 q0, [r2, #16] 2332; CHECK-NEXT: vldrh.s32 q1, [r1, #16] 2333; CHECK-NEXT: vmlava.u32 r0, q1, q0 2334; CHECK-NEXT: vldrh.s32 q0, [r2, #24] 2335; CHECK-NEXT: vldrh.s32 q1, [r1, #24] 2336; CHECK-NEXT: vmlava.u32 r0, q1, q0 2337; CHECK-NEXT: vldrh.s32 q0, [r2, #32] 2338; CHECK-NEXT: vldrh.s32 q1, [r1, #32] 2339; CHECK-NEXT: vmlava.u32 r0, q1, q0 2340; CHECK-NEXT: vldrh.s32 q0, [r2, #40] 2341; CHECK-NEXT: vldrh.s32 q1, [r1, #40] 2342; CHECK-NEXT: vmlava.u32 r0, q1, q0 2343; CHECK-NEXT: vldrh.s32 q0, [r2, #48] 2344; CHECK-NEXT: vldrh.s32 q1, [r1, #48] 2345; CHECK-NEXT: vmlava.u32 r0, q1, q0 2346; CHECK-NEXT: vldrh.s32 q0, [r2, #56] 2347; CHECK-NEXT: vldrh.s32 q1, [r1, #56] 2348; CHECK-NEXT: vmlava.u32 r0, q1, q0 2349; CHECK-NEXT: bx lr 2350entry: 2351 %0 = bitcast i16* %x to <32 x i16>* 2352 %1 = load <32 x i16>, <32 x i16>* %0, align 2 2353 %2 = sext <32 x i16> %1 to <32 x i32> 2354 %3 = bitcast i16* %y to <32 x i16>* 2355 %4 = load <32 x i16>, <32 x i16>* %3, align 2 2356 %5 = sext <32 x i16> %4 to <32 x i32> 2357 %6 = mul nsw <32 x i32> %5, %2 2358 %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6) 2359 ret i32 %7 2360} 2361 2362define i32 @mlav64i32i16(i16* %x, i16* %y) { 2363; CHECK-LABEL: mlav64i32i16: 2364; CHECK: @ %bb.0: @ %entry 2365; CHECK-NEXT: vldrh.u16 q0, [r0] 2366; CHECK-NEXT: vldrh.u16 q1, [r1] 2367; CHECK-NEXT: mov r2, r0 2368; CHECK-NEXT: vmlav.s16 r0, q1, q0 2369; CHECK-NEXT: vldrh.u16 q0, [r2, #16] 2370; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2371; CHECK-NEXT: vmlava.s16 r0, q1, q0 2372; CHECK-NEXT: vldrh.u16 q0, [r2, #32] 2373; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2374; CHECK-NEXT: vmlava.s16 r0, q1, q0 2375; CHECK-NEXT: vldrh.u16 q0, [r2, #48] 2376; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 2377; CHECK-NEXT: vmlava.s16 r0, q1, q0 2378; CHECK-NEXT: vldrh.u16 q0, [r2, #64] 2379; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 2380; CHECK-NEXT: vmlava.s16 r0, q1, q0 2381; CHECK-NEXT: vldrh.u16 q0, [r2, #80] 2382; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 2383; CHECK-NEXT: vmlava.s16 r0, q1, q0 2384; CHECK-NEXT: vldrh.u16 q0, [r2, #96] 2385; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 2386; CHECK-NEXT: vmlava.s16 r0, q1, q0 2387; CHECK-NEXT: vldrh.u16 q0, [r2, #112] 2388; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 2389; CHECK-NEXT: vmlava.s16 r0, q1, q0 2390; CHECK-NEXT: bx lr 2391entry: 2392 %0 = bitcast i16* %x to <8 x i16>* 2393 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 2394 %1 = sext <8 x i16> %wide.load to <8 x i32> 2395 %2 = bitcast i16* %y to <8 x i16>* 2396 %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2 2397 %3 = sext <8 x i16> %wide.load11 to <8 x i32> 2398 %4 = mul nsw <8 x i32> %3, %1 2399 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) 2400 %6 = getelementptr inbounds i16, i16* %x, i32 8 2401 %7 = bitcast i16* %6 to <8 x i16>* 2402 %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2 2403 %8 = sext <8 x i16> %wide.load.1 to <8 x i32> 2404 %9 = getelementptr inbounds i16, i16* %y, i32 8 2405 %10 = bitcast i16* %9 to <8 x i16>* 2406 %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2 2407 %11 = sext <8 x i16> %wide.load11.1 to <8 x i32> 2408 %12 = mul nsw <8 x i32> %11, %8 2409 %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12) 2410 %14 = add i32 %13, %5 2411 %15 = getelementptr inbounds i16, i16* %x, i32 16 2412 %16 = bitcast i16* %15 to <8 x i16>* 2413 %wide.load.2 = load <8 x 
i16>, <8 x i16>* %16, align 2 2414 %17 = sext <8 x i16> %wide.load.2 to <8 x i32> 2415 %18 = getelementptr inbounds i16, i16* %y, i32 16 2416 %19 = bitcast i16* %18 to <8 x i16>* 2417 %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2 2418 %20 = sext <8 x i16> %wide.load11.2 to <8 x i32> 2419 %21 = mul nsw <8 x i32> %20, %17 2420 %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21) 2421 %23 = add i32 %22, %14 2422 %24 = getelementptr inbounds i16, i16* %x, i32 24 2423 %25 = bitcast i16* %24 to <8 x i16>* 2424 %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2 2425 %26 = sext <8 x i16> %wide.load.3 to <8 x i32> 2426 %27 = getelementptr inbounds i16, i16* %y, i32 24 2427 %28 = bitcast i16* %27 to <8 x i16>* 2428 %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2 2429 %29 = sext <8 x i16> %wide.load11.3 to <8 x i32> 2430 %30 = mul nsw <8 x i32> %29, %26 2431 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30) 2432 %32 = add i32 %31, %23 2433 %33 = getelementptr inbounds i16, i16* %x, i32 32 2434 %34 = bitcast i16* %33 to <8 x i16>* 2435 %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2 2436 %35 = sext <8 x i16> %wide.load.4 to <8 x i32> 2437 %36 = getelementptr inbounds i16, i16* %y, i32 32 2438 %37 = bitcast i16* %36 to <8 x i16>* 2439 %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2 2440 %38 = sext <8 x i16> %wide.load11.4 to <8 x i32> 2441 %39 = mul nsw <8 x i32> %38, %35 2442 %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39) 2443 %41 = add i32 %40, %32 2444 %42 = getelementptr inbounds i16, i16* %x, i32 40 2445 %43 = bitcast i16* %42 to <8 x i16>* 2446 %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2 2447 %44 = sext <8 x i16> %wide.load.5 to <8 x i32> 2448 %45 = getelementptr inbounds i16, i16* %y, i32 40 2449 %46 = bitcast i16* %45 to <8 x i16>* 2450 %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2 2451 %47 = sext <8 x i16> %wide.load11.5 to <8 x i32> 2452 %48 = mul nsw <8 x i32> %47, %44 2453 %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48) 2454 %50 = add i32 %49, %41 2455 %51 = getelementptr inbounds i16, i16* %x, i32 48 2456 %52 = bitcast i16* %51 to <8 x i16>* 2457 %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2 2458 %53 = sext <8 x i16> %wide.load.6 to <8 x i32> 2459 %54 = getelementptr inbounds i16, i16* %y, i32 48 2460 %55 = bitcast i16* %54 to <8 x i16>* 2461 %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2 2462 %56 = sext <8 x i16> %wide.load11.6 to <8 x i32> 2463 %57 = mul nsw <8 x i32> %56, %53 2464 %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57) 2465 %59 = add i32 %58, %50 2466 %60 = getelementptr inbounds i16, i16* %x, i32 56 2467 %61 = bitcast i16* %60 to <8 x i16>* 2468 %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2 2469 %62 = sext <8 x i16> %wide.load.7 to <8 x i32> 2470 %63 = getelementptr inbounds i16, i16* %y, i32 56 2471 %64 = bitcast i16* %63 to <8 x i16>* 2472 %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2 2473 %65 = sext <8 x i16> %wide.load11.7 to <8 x i32> 2474 %66 = mul nsw <8 x i32> %65, %62 2475 %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66) 2476 %68 = add i32 %67, %59 2477 ret i32 %68 2478} 2479 2480define i32 @mlav128i32i16(i16* %x, i16* %y) { 2481; CHECK-LABEL: mlav128i32i16: 2482; CHECK: @ %bb.0: @ %entry 2483; CHECK-NEXT: vldrh.u16 q0, [r0] 2484; CHECK-NEXT: vldrh.u16 q1, [r1] 2485; CHECK-NEXT: mov r2, r0 2486; CHECK-NEXT: vmlav.s16 r0, q1, q0 2487; CHECK-NEXT: vldrh.u16 q0, [r2, #16] 2488; CHECK-NEXT: vldrh.u16 q1, [r1, 
#16] 2489; CHECK-NEXT: vmlava.s16 r0, q1, q0 2490; CHECK-NEXT: vldrh.u16 q0, [r2, #32] 2491; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2492; CHECK-NEXT: vmlava.s16 r0, q1, q0 2493; CHECK-NEXT: vldrh.u16 q0, [r2, #48] 2494; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 2495; CHECK-NEXT: vmlava.s16 r0, q1, q0 2496; CHECK-NEXT: vldrh.u16 q0, [r2, #64] 2497; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 2498; CHECK-NEXT: vmlava.s16 r0, q1, q0 2499; CHECK-NEXT: vldrh.u16 q0, [r2, #80] 2500; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 2501; CHECK-NEXT: vmlava.s16 r0, q1, q0 2502; CHECK-NEXT: vldrh.u16 q0, [r2, #96] 2503; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 2504; CHECK-NEXT: vmlava.s16 r0, q1, q0 2505; CHECK-NEXT: vldrh.u16 q0, [r2, #112] 2506; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 2507; CHECK-NEXT: vmlava.s16 r0, q1, q0 2508; CHECK-NEXT: vldrh.u16 q0, [r2, #128] 2509; CHECK-NEXT: vldrh.u16 q1, [r1, #128] 2510; CHECK-NEXT: vmlava.s16 r0, q1, q0 2511; CHECK-NEXT: vldrh.u16 q0, [r2, #144] 2512; CHECK-NEXT: vldrh.u16 q1, [r1, #144] 2513; CHECK-NEXT: vmlava.s16 r0, q1, q0 2514; CHECK-NEXT: vldrh.u16 q0, [r2, #160] 2515; CHECK-NEXT: vldrh.u16 q1, [r1, #160] 2516; CHECK-NEXT: vmlava.s16 r0, q1, q0 2517; CHECK-NEXT: vldrh.u16 q0, [r2, #176] 2518; CHECK-NEXT: vldrh.u16 q1, [r1, #176] 2519; CHECK-NEXT: vmlava.s16 r0, q1, q0 2520; CHECK-NEXT: vldrh.u16 q0, [r2, #192] 2521; CHECK-NEXT: vldrh.u16 q1, [r1, #192] 2522; CHECK-NEXT: vmlava.s16 r0, q1, q0 2523; CHECK-NEXT: vldrh.u16 q0, [r2, #208] 2524; CHECK-NEXT: vldrh.u16 q1, [r1, #208] 2525; CHECK-NEXT: vmlava.s16 r0, q1, q0 2526; CHECK-NEXT: vldrh.u16 q0, [r2, #224] 2527; CHECK-NEXT: vldrh.u16 q1, [r1, #224] 2528; CHECK-NEXT: vmlava.s16 r0, q1, q0 2529; CHECK-NEXT: vldrh.u16 q0, [r2, #240] 2530; CHECK-NEXT: vldrh.u16 q1, [r1, #240] 2531; CHECK-NEXT: vmlava.s16 r0, q1, q0 2532; CHECK-NEXT: bx lr 2533entry: 2534 %0 = bitcast i16* %x to <8 x i16>* 2535 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 2536 %1 = sext <8 x i16> %wide.load to <8 x i32> 2537 %2 = bitcast i16* %y to <8 x i16>* 2538 %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2 2539 %3 = sext <8 x i16> %wide.load11 to <8 x i32> 2540 %4 = mul nsw <8 x i32> %3, %1 2541 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) 2542 %6 = getelementptr inbounds i16, i16* %x, i32 8 2543 %7 = bitcast i16* %6 to <8 x i16>* 2544 %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2 2545 %8 = sext <8 x i16> %wide.load.1 to <8 x i32> 2546 %9 = getelementptr inbounds i16, i16* %y, i32 8 2547 %10 = bitcast i16* %9 to <8 x i16>* 2548 %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2 2549 %11 = sext <8 x i16> %wide.load11.1 to <8 x i32> 2550 %12 = mul nsw <8 x i32> %11, %8 2551 %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12) 2552 %14 = add i32 %13, %5 2553 %15 = getelementptr inbounds i16, i16* %x, i32 16 2554 %16 = bitcast i16* %15 to <8 x i16>* 2555 %wide.load.2 = load <8 x i16>, <8 x i16>* %16, align 2 2556 %17 = sext <8 x i16> %wide.load.2 to <8 x i32> 2557 %18 = getelementptr inbounds i16, i16* %y, i32 16 2558 %19 = bitcast i16* %18 to <8 x i16>* 2559 %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2 2560 %20 = sext <8 x i16> %wide.load11.2 to <8 x i32> 2561 %21 = mul nsw <8 x i32> %20, %17 2562 %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21) 2563 %23 = add i32 %22, %14 2564 %24 = getelementptr inbounds i16, i16* %x, i32 24 2565 %25 = bitcast i16* %24 to <8 x i16>* 2566 %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2 2567 %26 = sext <8 x i16> %wide.load.3 to <8 x i32> 2568 %27 = 
getelementptr inbounds i16, i16* %y, i32 24 2569 %28 = bitcast i16* %27 to <8 x i16>* 2570 %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2 2571 %29 = sext <8 x i16> %wide.load11.3 to <8 x i32> 2572 %30 = mul nsw <8 x i32> %29, %26 2573 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30) 2574 %32 = add i32 %31, %23 2575 %33 = getelementptr inbounds i16, i16* %x, i32 32 2576 %34 = bitcast i16* %33 to <8 x i16>* 2577 %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2 2578 %35 = sext <8 x i16> %wide.load.4 to <8 x i32> 2579 %36 = getelementptr inbounds i16, i16* %y, i32 32 2580 %37 = bitcast i16* %36 to <8 x i16>* 2581 %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2 2582 %38 = sext <8 x i16> %wide.load11.4 to <8 x i32> 2583 %39 = mul nsw <8 x i32> %38, %35 2584 %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39) 2585 %41 = add i32 %40, %32 2586 %42 = getelementptr inbounds i16, i16* %x, i32 40 2587 %43 = bitcast i16* %42 to <8 x i16>* 2588 %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2 2589 %44 = sext <8 x i16> %wide.load.5 to <8 x i32> 2590 %45 = getelementptr inbounds i16, i16* %y, i32 40 2591 %46 = bitcast i16* %45 to <8 x i16>* 2592 %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2 2593 %47 = sext <8 x i16> %wide.load11.5 to <8 x i32> 2594 %48 = mul nsw <8 x i32> %47, %44 2595 %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48) 2596 %50 = add i32 %49, %41 2597 %51 = getelementptr inbounds i16, i16* %x, i32 48 2598 %52 = bitcast i16* %51 to <8 x i16>* 2599 %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2 2600 %53 = sext <8 x i16> %wide.load.6 to <8 x i32> 2601 %54 = getelementptr inbounds i16, i16* %y, i32 48 2602 %55 = bitcast i16* %54 to <8 x i16>* 2603 %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2 2604 %56 = sext <8 x i16> %wide.load11.6 to <8 x i32> 2605 %57 = mul nsw <8 x i32> %56, %53 2606 %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57) 2607 %59 = add i32 %58, %50 2608 %60 = getelementptr inbounds i16, i16* %x, i32 56 2609 %61 = bitcast i16* %60 to <8 x i16>* 2610 %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2 2611 %62 = sext <8 x i16> %wide.load.7 to <8 x i32> 2612 %63 = getelementptr inbounds i16, i16* %y, i32 56 2613 %64 = bitcast i16* %63 to <8 x i16>* 2614 %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2 2615 %65 = sext <8 x i16> %wide.load11.7 to <8 x i32> 2616 %66 = mul nsw <8 x i32> %65, %62 2617 %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66) 2618 %68 = add i32 %67, %59 2619 %69 = getelementptr inbounds i16, i16* %x, i32 64 2620 %70 = bitcast i16* %69 to <8 x i16>* 2621 %wide.load.8 = load <8 x i16>, <8 x i16>* %70, align 2 2622 %71 = sext <8 x i16> %wide.load.8 to <8 x i32> 2623 %72 = getelementptr inbounds i16, i16* %y, i32 64 2624 %73 = bitcast i16* %72 to <8 x i16>* 2625 %wide.load11.8 = load <8 x i16>, <8 x i16>* %73, align 2 2626 %74 = sext <8 x i16> %wide.load11.8 to <8 x i32> 2627 %75 = mul nsw <8 x i32> %74, %71 2628 %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75) 2629 %77 = add i32 %76, %68 2630 %78 = getelementptr inbounds i16, i16* %x, i32 72 2631 %79 = bitcast i16* %78 to <8 x i16>* 2632 %wide.load.9 = load <8 x i16>, <8 x i16>* %79, align 2 2633 %80 = sext <8 x i16> %wide.load.9 to <8 x i32> 2634 %81 = getelementptr inbounds i16, i16* %y, i32 72 2635 %82 = bitcast i16* %81 to <8 x i16>* 2636 %wide.load11.9 = load <8 x i16>, <8 x i16>* %82, align 2 2637 %83 = sext <8 x i16> %wide.load11.9 to <8 x i32> 2638 %84 = mul nsw <8 x i32> %83, 
%80 2639 %85 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %84) 2640 %86 = add i32 %85, %77 2641 %87 = getelementptr inbounds i16, i16* %x, i32 80 2642 %88 = bitcast i16* %87 to <8 x i16>* 2643 %wide.load.10 = load <8 x i16>, <8 x i16>* %88, align 2 2644 %89 = sext <8 x i16> %wide.load.10 to <8 x i32> 2645 %90 = getelementptr inbounds i16, i16* %y, i32 80 2646 %91 = bitcast i16* %90 to <8 x i16>* 2647 %wide.load11.10 = load <8 x i16>, <8 x i16>* %91, align 2 2648 %92 = sext <8 x i16> %wide.load11.10 to <8 x i32> 2649 %93 = mul nsw <8 x i32> %92, %89 2650 %94 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %93) 2651 %95 = add i32 %94, %86 2652 %96 = getelementptr inbounds i16, i16* %x, i32 88 2653 %97 = bitcast i16* %96 to <8 x i16>* 2654 %wide.load.11 = load <8 x i16>, <8 x i16>* %97, align 2 2655 %98 = sext <8 x i16> %wide.load.11 to <8 x i32> 2656 %99 = getelementptr inbounds i16, i16* %y, i32 88 2657 %100 = bitcast i16* %99 to <8 x i16>* 2658 %wide.load11.11 = load <8 x i16>, <8 x i16>* %100, align 2 2659 %101 = sext <8 x i16> %wide.load11.11 to <8 x i32> 2660 %102 = mul nsw <8 x i32> %101, %98 2661 %103 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %102) 2662 %104 = add i32 %103, %95 2663 %105 = getelementptr inbounds i16, i16* %x, i32 96 2664 %106 = bitcast i16* %105 to <8 x i16>* 2665 %wide.load.12 = load <8 x i16>, <8 x i16>* %106, align 2 2666 %107 = sext <8 x i16> %wide.load.12 to <8 x i32> 2667 %108 = getelementptr inbounds i16, i16* %y, i32 96 2668 %109 = bitcast i16* %108 to <8 x i16>* 2669 %wide.load11.12 = load <8 x i16>, <8 x i16>* %109, align 2 2670 %110 = sext <8 x i16> %wide.load11.12 to <8 x i32> 2671 %111 = mul nsw <8 x i32> %110, %107 2672 %112 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %111) 2673 %113 = add i32 %112, %104 2674 %114 = getelementptr inbounds i16, i16* %x, i32 104 2675 %115 = bitcast i16* %114 to <8 x i16>* 2676 %wide.load.13 = load <8 x i16>, <8 x i16>* %115, align 2 2677 %116 = sext <8 x i16> %wide.load.13 to <8 x i32> 2678 %117 = getelementptr inbounds i16, i16* %y, i32 104 2679 %118 = bitcast i16* %117 to <8 x i16>* 2680 %wide.load11.13 = load <8 x i16>, <8 x i16>* %118, align 2 2681 %119 = sext <8 x i16> %wide.load11.13 to <8 x i32> 2682 %120 = mul nsw <8 x i32> %119, %116 2683 %121 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %120) 2684 %122 = add i32 %121, %113 2685 %123 = getelementptr inbounds i16, i16* %x, i32 112 2686 %124 = bitcast i16* %123 to <8 x i16>* 2687 %wide.load.14 = load <8 x i16>, <8 x i16>* %124, align 2 2688 %125 = sext <8 x i16> %wide.load.14 to <8 x i32> 2689 %126 = getelementptr inbounds i16, i16* %y, i32 112 2690 %127 = bitcast i16* %126 to <8 x i16>* 2691 %wide.load11.14 = load <8 x i16>, <8 x i16>* %127, align 2 2692 %128 = sext <8 x i16> %wide.load11.14 to <8 x i32> 2693 %129 = mul nsw <8 x i32> %128, %125 2694 %130 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %129) 2695 %131 = add i32 %130, %122 2696 %132 = getelementptr inbounds i16, i16* %x, i32 120 2697 %133 = bitcast i16* %132 to <8 x i16>* 2698 %wide.load.15 = load <8 x i16>, <8 x i16>* %133, align 2 2699 %134 = sext <8 x i16> %wide.load.15 to <8 x i32> 2700 %135 = getelementptr inbounds i16, i16* %y, i32 120 2701 %136 = bitcast i16* %135 to <8 x i16>* 2702 %wide.load11.15 = load <8 x i16>, <8 x i16>* %136, align 2 2703 %137 = sext <8 x i16> %wide.load11.15 to <8 x i32> 2704 %138 = mul nsw <8 x i32> %137, %134 2705 %139 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %138) 2706 %140 = add i32 %139, %131 2707 ret i32 %140 2708} 2709 
define i32 @mlav2i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav2i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i8, i8* %x, align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, i8* %y, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
  %2 = load i8, i8* %arrayidx.1, align 1
  %conv.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1
  %3 = load i8, i8* %arrayidx1.1, align 1
  %conv2.1 = zext i8 %3 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %add.1 = add nuw nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav4i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = bitcast i8* %x to <4 x i8>*
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i8>*
  %4 = load <4 x i8>, <4 x i8>* %3, align 1
  %5 = zext <4 x i8> %4 to <4 x i32>
  %6 = mul nuw nsw <4 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
  ret i32 %7
}

define i32 @mlav8i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav8i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = bitcast i8* %x to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %y to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = mul nuw nsw <8 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
  ret i32 %7
}

define i32 @mlav16i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav16i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = bitcast i8* %x to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %y to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 1
  %5 = zext <16 x i8> %4 to <16 x i32>
  %6 = mul nuw nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  ret i32 %7
}

define i32 @mlav24i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav24i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = bitcast i8* %x to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %y to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = mul nuw nsw <8 x i32> %5, %2
  %arrayidx.8 =
getelementptr inbounds i8, i8* %x, i32 8 2814 %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8 2815 %7 = bitcast i8* %arrayidx.8 to <16 x i8>* 2816 %8 = load <16 x i8>, <16 x i8>* %7, align 1 2817 %9 = zext <16 x i8> %8 to <16 x i32> 2818 %10 = bitcast i8* %arrayidx1.8 to <16 x i8>* 2819 %11 = load <16 x i8>, <16 x i8>* %10, align 1 2820 %12 = zext <16 x i8> %11 to <16 x i32> 2821 %13 = mul nuw nsw <16 x i32> %12, %9 2822 %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13) 2823 %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) 2824 %op.rdx = add nuw nsw i32 %14, %15 2825 ret i32 %op.rdx 2826} 2827 2828define i32 @mlav32i32i8(i8* %x, i8* %y) { 2829; CHECK-LABEL: mlav32i32i8: 2830; CHECK: @ %bb.0: @ %entry 2831; CHECK-NEXT: vldrb.u32 q0, [r0] 2832; CHECK-NEXT: vldrb.u32 q1, [r1] 2833; CHECK-NEXT: mov r2, r0 2834; CHECK-NEXT: vmlav.u32 r0, q1, q0 2835; CHECK-NEXT: vldrb.u32 q0, [r2, #4] 2836; CHECK-NEXT: vldrb.u32 q1, [r1, #4] 2837; CHECK-NEXT: vmlava.u32 r0, q1, q0 2838; CHECK-NEXT: vldrb.u32 q0, [r2, #8] 2839; CHECK-NEXT: vldrb.u32 q1, [r1, #8] 2840; CHECK-NEXT: vmlava.u32 r0, q1, q0 2841; CHECK-NEXT: vldrb.u32 q0, [r2, #12] 2842; CHECK-NEXT: vldrb.u32 q1, [r1, #12] 2843; CHECK-NEXT: vmlava.u32 r0, q1, q0 2844; CHECK-NEXT: vldrb.u32 q0, [r2, #16] 2845; CHECK-NEXT: vldrb.u32 q1, [r1, #16] 2846; CHECK-NEXT: vmlava.u32 r0, q1, q0 2847; CHECK-NEXT: vldrb.u32 q0, [r2, #20] 2848; CHECK-NEXT: vldrb.u32 q1, [r1, #20] 2849; CHECK-NEXT: vmlava.u32 r0, q1, q0 2850; CHECK-NEXT: vldrb.u32 q0, [r2, #24] 2851; CHECK-NEXT: vldrb.u32 q1, [r1, #24] 2852; CHECK-NEXT: vmlava.u32 r0, q1, q0 2853; CHECK-NEXT: vldrb.u32 q0, [r2, #28] 2854; CHECK-NEXT: vldrb.u32 q1, [r1, #28] 2855; CHECK-NEXT: vmlava.u32 r0, q1, q0 2856; CHECK-NEXT: bx lr 2857entry: 2858 %0 = bitcast i8* %x to <32 x i8>* 2859 %1 = load <32 x i8>, <32 x i8>* %0, align 1 2860 %2 = zext <32 x i8> %1 to <32 x i32> 2861 %3 = bitcast i8* %y to <32 x i8>* 2862 %4 = load <32 x i8>, <32 x i8>* %3, align 1 2863 %5 = zext <32 x i8> %4 to <32 x i32> 2864 %6 = mul nuw nsw <32 x i32> %5, %2 2865 %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6) 2866 ret i32 %7 2867} 2868 2869define i32 @mlav64i32i8(i8* %x, i8* %y) { 2870; CHECK-LABEL: mlav64i32i8: 2871; CHECK: @ %bb.0: @ %entry 2872; CHECK-NEXT: vldrb.u8 q0, [r0] 2873; CHECK-NEXT: vldrb.u8 q1, [r1] 2874; CHECK-NEXT: vmlav.u8 r2, q1, q0 2875; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 2876; CHECK-NEXT: vldrb.u8 q1, [r1, #16] 2877; CHECK-NEXT: vmlava.u8 r2, q1, q0 2878; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 2879; CHECK-NEXT: vldrb.u8 q1, [r1, #32] 2880; CHECK-NEXT: vmlava.u8 r2, q1, q0 2881; CHECK-NEXT: vldrb.u8 q0, [r0, #48] 2882; CHECK-NEXT: vldrb.u8 q1, [r1, #48] 2883; CHECK-NEXT: vmlava.u8 r2, q1, q0 2884; CHECK-NEXT: mov r0, r2 2885; CHECK-NEXT: bx lr 2886entry: 2887 %0 = bitcast i8* %x to <16 x i8>* 2888 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 2889 %1 = zext <16 x i8> %wide.load to <16 x i32> 2890 %2 = bitcast i8* %y to <16 x i8>* 2891 %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1 2892 %3 = zext <16 x i8> %wide.load11 to <16 x i32> 2893 %4 = mul nuw nsw <16 x i32> %3, %1 2894 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) 2895 %6 = getelementptr inbounds i8, i8* %x, i32 16 2896 %7 = bitcast i8* %6 to <16 x i8>* 2897 %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1 2898 %8 = zext <16 x i8> %wide.load.1 to <16 x i32> 2899 %9 = getelementptr inbounds i8, i8* %y, i32 16 2900 %10 = bitcast i8* %9 to <16 x i8>* 2901 %wide.load11.1 = load <16 x i8>, 
<16 x i8>* %10, align 1 2902 %11 = zext <16 x i8> %wide.load11.1 to <16 x i32> 2903 %12 = mul nuw nsw <16 x i32> %11, %8 2904 %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12) 2905 %14 = add i32 %13, %5 2906 %15 = getelementptr inbounds i8, i8* %x, i32 32 2907 %16 = bitcast i8* %15 to <16 x i8>* 2908 %wide.load.2 = load <16 x i8>, <16 x i8>* %16, align 1 2909 %17 = zext <16 x i8> %wide.load.2 to <16 x i32> 2910 %18 = getelementptr inbounds i8, i8* %y, i32 32 2911 %19 = bitcast i8* %18 to <16 x i8>* 2912 %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1 2913 %20 = zext <16 x i8> %wide.load11.2 to <16 x i32> 2914 %21 = mul nuw nsw <16 x i32> %20, %17 2915 %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21) 2916 %23 = add i32 %22, %14 2917 %24 = getelementptr inbounds i8, i8* %x, i32 48 2918 %25 = bitcast i8* %24 to <16 x i8>* 2919 %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1 2920 %26 = zext <16 x i8> %wide.load.3 to <16 x i32> 2921 %27 = getelementptr inbounds i8, i8* %y, i32 48 2922 %28 = bitcast i8* %27 to <16 x i8>* 2923 %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1 2924 %29 = zext <16 x i8> %wide.load11.3 to <16 x i32> 2925 %30 = mul nuw nsw <16 x i32> %29, %26 2926 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30) 2927 %32 = add i32 %31, %23 2928 ret i32 %32 2929} 2930 2931define i32 @mlav128i32i8(i8* %x, i8* %y) { 2932; CHECK-LABEL: mlav128i32i8: 2933; CHECK: @ %bb.0: @ %entry 2934; CHECK-NEXT: vldrb.u8 q0, [r0] 2935; CHECK-NEXT: vldrb.u8 q1, [r1] 2936; CHECK-NEXT: mov r2, r0 2937; CHECK-NEXT: vmlav.u8 r0, q1, q0 2938; CHECK-NEXT: vldrb.u8 q0, [r2, #16] 2939; CHECK-NEXT: vldrb.u8 q1, [r1, #16] 2940; CHECK-NEXT: vmlava.u8 r0, q1, q0 2941; CHECK-NEXT: vldrb.u8 q0, [r2, #32] 2942; CHECK-NEXT: vldrb.u8 q1, [r1, #32] 2943; CHECK-NEXT: vmlava.u8 r0, q1, q0 2944; CHECK-NEXT: vldrb.u8 q0, [r2, #48] 2945; CHECK-NEXT: vldrb.u8 q1, [r1, #48] 2946; CHECK-NEXT: vmlava.u8 r0, q1, q0 2947; CHECK-NEXT: vldrb.u8 q0, [r2, #64] 2948; CHECK-NEXT: vldrb.u8 q1, [r1, #64] 2949; CHECK-NEXT: vmlava.u8 r0, q1, q0 2950; CHECK-NEXT: vldrb.u8 q0, [r2, #80] 2951; CHECK-NEXT: vldrb.u8 q1, [r1, #80] 2952; CHECK-NEXT: vmlava.u8 r0, q1, q0 2953; CHECK-NEXT: vldrb.u8 q0, [r2, #96] 2954; CHECK-NEXT: vldrb.u8 q1, [r1, #96] 2955; CHECK-NEXT: vmlava.u8 r0, q1, q0 2956; CHECK-NEXT: vldrb.u8 q0, [r2, #112] 2957; CHECK-NEXT: vldrb.u8 q1, [r1, #112] 2958; CHECK-NEXT: vmlava.u8 r0, q1, q0 2959; CHECK-NEXT: bx lr 2960entry: 2961 %0 = bitcast i8* %x to <16 x i8>* 2962 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 2963 %1 = zext <16 x i8> %wide.load to <16 x i32> 2964 %2 = bitcast i8* %y to <16 x i8>* 2965 %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1 2966 %3 = zext <16 x i8> %wide.load11 to <16 x i32> 2967 %4 = mul nuw nsw <16 x i32> %3, %1 2968 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) 2969 %6 = getelementptr inbounds i8, i8* %x, i32 16 2970 %7 = bitcast i8* %6 to <16 x i8>* 2971 %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1 2972 %8 = zext <16 x i8> %wide.load.1 to <16 x i32> 2973 %9 = getelementptr inbounds i8, i8* %y, i32 16 2974 %10 = bitcast i8* %9 to <16 x i8>* 2975 %wide.load11.1 = load <16 x i8>, <16 x i8>* %10, align 1 2976 %11 = zext <16 x i8> %wide.load11.1 to <16 x i32> 2977 %12 = mul nuw nsw <16 x i32> %11, %8 2978 %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12) 2979 %14 = add i32 %13, %5 2980 %15 = getelementptr inbounds i8, i8* %x, i32 32 2981 %16 = bitcast i8* %15 to <16 x i8>* 2982 %wide.load.2 = 
load <16 x i8>, <16 x i8>* %16, align 1 2983 %17 = zext <16 x i8> %wide.load.2 to <16 x i32> 2984 %18 = getelementptr inbounds i8, i8* %y, i32 32 2985 %19 = bitcast i8* %18 to <16 x i8>* 2986 %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1 2987 %20 = zext <16 x i8> %wide.load11.2 to <16 x i32> 2988 %21 = mul nuw nsw <16 x i32> %20, %17 2989 %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21) 2990 %23 = add i32 %22, %14 2991 %24 = getelementptr inbounds i8, i8* %x, i32 48 2992 %25 = bitcast i8* %24 to <16 x i8>* 2993 %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1 2994 %26 = zext <16 x i8> %wide.load.3 to <16 x i32> 2995 %27 = getelementptr inbounds i8, i8* %y, i32 48 2996 %28 = bitcast i8* %27 to <16 x i8>* 2997 %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1 2998 %29 = zext <16 x i8> %wide.load11.3 to <16 x i32> 2999 %30 = mul nuw nsw <16 x i32> %29, %26 3000 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30) 3001 %32 = add i32 %31, %23 3002 %33 = getelementptr inbounds i8, i8* %x, i32 64 3003 %34 = bitcast i8* %33 to <16 x i8>* 3004 %wide.load.4 = load <16 x i8>, <16 x i8>* %34, align 1 3005 %35 = zext <16 x i8> %wide.load.4 to <16 x i32> 3006 %36 = getelementptr inbounds i8, i8* %y, i32 64 3007 %37 = bitcast i8* %36 to <16 x i8>* 3008 %wide.load11.4 = load <16 x i8>, <16 x i8>* %37, align 1 3009 %38 = zext <16 x i8> %wide.load11.4 to <16 x i32> 3010 %39 = mul nuw nsw <16 x i32> %38, %35 3011 %40 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %39) 3012 %41 = add i32 %40, %32 3013 %42 = getelementptr inbounds i8, i8* %x, i32 80 3014 %43 = bitcast i8* %42 to <16 x i8>* 3015 %wide.load.5 = load <16 x i8>, <16 x i8>* %43, align 1 3016 %44 = zext <16 x i8> %wide.load.5 to <16 x i32> 3017 %45 = getelementptr inbounds i8, i8* %y, i32 80 3018 %46 = bitcast i8* %45 to <16 x i8>* 3019 %wide.load11.5 = load <16 x i8>, <16 x i8>* %46, align 1 3020 %47 = zext <16 x i8> %wide.load11.5 to <16 x i32> 3021 %48 = mul nuw nsw <16 x i32> %47, %44 3022 %49 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %48) 3023 %50 = add i32 %49, %41 3024 %51 = getelementptr inbounds i8, i8* %x, i32 96 3025 %52 = bitcast i8* %51 to <16 x i8>* 3026 %wide.load.6 = load <16 x i8>, <16 x i8>* %52, align 1 3027 %53 = zext <16 x i8> %wide.load.6 to <16 x i32> 3028 %54 = getelementptr inbounds i8, i8* %y, i32 96 3029 %55 = bitcast i8* %54 to <16 x i8>* 3030 %wide.load11.6 = load <16 x i8>, <16 x i8>* %55, align 1 3031 %56 = zext <16 x i8> %wide.load11.6 to <16 x i32> 3032 %57 = mul nuw nsw <16 x i32> %56, %53 3033 %58 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %57) 3034 %59 = add i32 %58, %50 3035 %60 = getelementptr inbounds i8, i8* %x, i32 112 3036 %61 = bitcast i8* %60 to <16 x i8>* 3037 %wide.load.7 = load <16 x i8>, <16 x i8>* %61, align 1 3038 %62 = zext <16 x i8> %wide.load.7 to <16 x i32> 3039 %63 = getelementptr inbounds i8, i8* %y, i32 112 3040 %64 = bitcast i8* %63 to <16 x i8>* 3041 %wide.load11.7 = load <16 x i8>, <16 x i8>* %64, align 1 3042 %65 = zext <16 x i8> %wide.load11.7 to <16 x i32> 3043 %66 = mul nuw nsw <16 x i32> %65, %62 3044 %67 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %66) 3045 %68 = add i32 %67, %59 3046 ret i32 %68 3047} 3048 3049define signext i16 @mlav2i16i16(i16* %x, i16* %y) { 3050; CHECK-LABEL: mlav2i16i16: 3051; CHECK: @ %bb.0: @ %entry 3052; CHECK-NEXT: ldrh r2, [r0] 3053; CHECK-NEXT: ldrh r3, [r1] 3054; CHECK-NEXT: ldrh r0, [r0, #2] 3055; CHECK-NEXT: ldrh r1, [r1, #2] 3056; CHECK-NEXT: muls r2, r3, r2 3057; 
CHECK-NEXT: mla r0, r1, r0, r2 3058; CHECK-NEXT: sxth r0, r0 3059; CHECK-NEXT: bx lr 3060entry: 3061 %0 = load i16, i16* %x, align 2 3062 %1 = load i16, i16* %y, align 2 3063 %mul = mul i16 %1, %0 3064 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1 3065 %2 = load i16, i16* %arrayidx.1, align 2 3066 %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1 3067 %3 = load i16, i16* %arrayidx1.1, align 2 3068 %mul.1 = mul i16 %3, %2 3069 %add.1 = add i16 %mul.1, %mul 3070 ret i16 %add.1 3071} 3072 3073define signext i16 @mlav4i16i16(i16* %x, i16* %y) { 3074; CHECK-LABEL: mlav4i16i16: 3075; CHECK: @ %bb.0: @ %entry 3076; CHECK-NEXT: vldrh.u32 q0, [r0] 3077; CHECK-NEXT: vldrh.u32 q1, [r1] 3078; CHECK-NEXT: vmlav.u32 r0, q1, q0 3079; CHECK-NEXT: sxth r0, r0 3080; CHECK-NEXT: bx lr 3081entry: 3082 %0 = bitcast i16* %x to <4 x i16>* 3083 %1 = load <4 x i16>, <4 x i16>* %0, align 2 3084 %2 = bitcast i16* %y to <4 x i16>* 3085 %3 = load <4 x i16>, <4 x i16>* %2, align 2 3086 %4 = mul <4 x i16> %3, %1 3087 %5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %4) 3088 ret i16 %5 3089} 3090 3091define signext i16 @mlav8i16i16(i16* %x, i16* %y) { 3092; CHECK-LABEL: mlav8i16i16: 3093; CHECK: @ %bb.0: @ %entry 3094; CHECK-NEXT: vldrh.u16 q0, [r0] 3095; CHECK-NEXT: vldrh.u16 q1, [r1] 3096; CHECK-NEXT: vmlav.u16 r0, q1, q0 3097; CHECK-NEXT: sxth r0, r0 3098; CHECK-NEXT: bx lr 3099entry: 3100 %0 = bitcast i16* %x to <8 x i16>* 3101 %1 = load <8 x i16>, <8 x i16>* %0, align 2 3102 %2 = bitcast i16* %y to <8 x i16>* 3103 %3 = load <8 x i16>, <8 x i16>* %2, align 2 3104 %4 = mul <8 x i16> %3, %1 3105 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4) 3106 ret i16 %5 3107} 3108 3109define signext i16 @mlav16i16i16(i16* %x, i16* %y) { 3110; CHECK-LABEL: mlav16i16i16: 3111; CHECK: @ %bb.0: @ %entry 3112; CHECK-NEXT: vldrh.u16 q0, [r0] 3113; CHECK-NEXT: vldrh.u16 q1, [r1] 3114; CHECK-NEXT: vmlav.u16 r2, q1, q0 3115; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 3116; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 3117; CHECK-NEXT: vmlava.u16 r2, q1, q0 3118; CHECK-NEXT: sxth r0, r2 3119; CHECK-NEXT: bx lr 3120entry: 3121 %0 = bitcast i16* %x to <16 x i16>* 3122 %1 = load <16 x i16>, <16 x i16>* %0, align 2 3123 %2 = bitcast i16* %y to <16 x i16>* 3124 %3 = load <16 x i16>, <16 x i16>* %2, align 2 3125 %4 = mul <16 x i16> %3, %1 3126 %5 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %4) 3127 ret i16 %5 3128} 3129 3130define signext i16 @mlav24i16i16(i16* %x, i16* %y) { 3131; CHECK-LABEL: mlav24i16i16: 3132; CHECK: @ %bb.0: @ %entry 3133; CHECK-NEXT: vldrh.u16 q0, [r0] 3134; CHECK-NEXT: vldrh.u16 q1, [r1] 3135; CHECK-NEXT: vmlav.u16 r2, q1, q0 3136; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 3137; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 3138; CHECK-NEXT: vmlava.u16 r2, q1, q0 3139; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 3140; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 3141; CHECK-NEXT: vmlava.u16 r2, q1, q0 3142; CHECK-NEXT: sxth r0, r2 3143; CHECK-NEXT: bx lr 3144entry: 3145 %0 = bitcast i16* %x to <8 x i16>* 3146 %1 = load <8 x i16>, <8 x i16>* %0, align 2 3147 %2 = bitcast i16* %y to <8 x i16>* 3148 %3 = load <8 x i16>, <8 x i16>* %2, align 2 3149 %4 = mul <8 x i16> %3, %1 3150 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8 3151 %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8 3152 %5 = bitcast i16* %arrayidx.8 to <16 x i16>* 3153 %6 = load <16 x i16>, <16 x i16>* %5, align 2 3154 %7 = bitcast i16* %arrayidx1.8 to <16 x i16>* 3155 %8 = load <16 x i16>, <16 x i16>* %7, align 2 3156 %9 = mul <16 x i16> %8, %6 3157 
%10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %9) 3158 %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4) 3159 %op.rdx = add i16 %10, %11 3160 ret i16 %op.rdx 3161} 3162 3163define signext i16 @mlav32i16i16(i16* %x, i16* %y) { 3164; CHECK-LABEL: mlav32i16i16: 3165; CHECK: @ %bb.0: @ %entry 3166; CHECK-NEXT: vldrh.u16 q0, [r0] 3167; CHECK-NEXT: vldrh.u16 q1, [r1] 3168; CHECK-NEXT: vmlav.u16 r2, q1, q0 3169; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 3170; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 3171; CHECK-NEXT: vmlava.u16 r2, q1, q0 3172; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 3173; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 3174; CHECK-NEXT: vmlava.u16 r2, q1, q0 3175; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 3176; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 3177; CHECK-NEXT: vmlava.u16 r2, q1, q0 3178; CHECK-NEXT: sxth r0, r2 3179; CHECK-NEXT: bx lr 3180entry: 3181 %0 = bitcast i16* %x to <32 x i16>* 3182 %1 = load <32 x i16>, <32 x i16>* %0, align 2 3183 %2 = bitcast i16* %y to <32 x i16>* 3184 %3 = load <32 x i16>, <32 x i16>* %2, align 2 3185 %4 = mul <32 x i16> %3, %1 3186 %5 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %4) 3187 ret i16 %5 3188} 3189 3190define signext i16 @mlav64i16i16(i16* %x, i16* %y) { 3191; CHECK-LABEL: mlav64i16i16: 3192; CHECK: @ %bb.0: @ %entry 3193; CHECK-NEXT: vldrh.u16 q0, [r0] 3194; CHECK-NEXT: vldrh.u16 q1, [r1] 3195; CHECK-NEXT: vmlav.u16 r2, q1, q0 3196; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 3197; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 3198; CHECK-NEXT: vmlava.u16 r2, q1, q0 3199; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 3200; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 3201; CHECK-NEXT: vmlava.u16 r2, q1, q0 3202; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 3203; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 3204; CHECK-NEXT: vmlava.u16 r2, q1, q0 3205; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 3206; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 3207; CHECK-NEXT: vmlava.u16 r2, q1, q0 3208; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 3209; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 3210; CHECK-NEXT: vmlava.u16 r2, q1, q0 3211; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 3212; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 3213; CHECK-NEXT: vmlava.u16 r2, q1, q0 3214; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 3215; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 3216; CHECK-NEXT: vmlava.u16 r2, q1, q0 3217; CHECK-NEXT: sxth r0, r2 3218; CHECK-NEXT: bx lr 3219entry: 3220 %0 = bitcast i16* %x to <8 x i16>* 3221 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 3222 %1 = bitcast i16* %y to <8 x i16>* 3223 %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2 3224 %2 = mul <8 x i16> %wide.load13, %wide.load 3225 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) 3226 %4 = getelementptr inbounds i16, i16* %x, i32 8 3227 %5 = bitcast i16* %4 to <8 x i16>* 3228 %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2 3229 %6 = getelementptr inbounds i16, i16* %y, i32 8 3230 %7 = bitcast i16* %6 to <8 x i16>* 3231 %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2 3232 %8 = mul <8 x i16> %wide.load13.1, %wide.load.1 3233 %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8) 3234 %10 = add i16 %9, %3 3235 %11 = getelementptr inbounds i16, i16* %x, i32 16 3236 %12 = bitcast i16* %11 to <8 x i16>* 3237 %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2 3238 %13 = getelementptr inbounds i16, i16* %y, i32 16 3239 %14 = bitcast i16* %13 to <8 x i16>* 3240 %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2 3241 %15 = mul <8 x i16> %wide.load13.2, %wide.load.2 3242 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15) 3243 %17 = add i16 %16, 
%10 3244 %18 = getelementptr inbounds i16, i16* %x, i32 24 3245 %19 = bitcast i16* %18 to <8 x i16>* 3246 %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2 3247 %20 = getelementptr inbounds i16, i16* %y, i32 24 3248 %21 = bitcast i16* %20 to <8 x i16>* 3249 %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2 3250 %22 = mul <8 x i16> %wide.load13.3, %wide.load.3 3251 %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22) 3252 %24 = add i16 %23, %17 3253 %25 = getelementptr inbounds i16, i16* %x, i32 32 3254 %26 = bitcast i16* %25 to <8 x i16>* 3255 %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2 3256 %27 = getelementptr inbounds i16, i16* %y, i32 32 3257 %28 = bitcast i16* %27 to <8 x i16>* 3258 %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2 3259 %29 = mul <8 x i16> %wide.load13.4, %wide.load.4 3260 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29) 3261 %31 = add i16 %30, %24 3262 %32 = getelementptr inbounds i16, i16* %x, i32 40 3263 %33 = bitcast i16* %32 to <8 x i16>* 3264 %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2 3265 %34 = getelementptr inbounds i16, i16* %y, i32 40 3266 %35 = bitcast i16* %34 to <8 x i16>* 3267 %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2 3268 %36 = mul <8 x i16> %wide.load13.5, %wide.load.5 3269 %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %36) 3270 %38 = add i16 %37, %31 3271 %39 = getelementptr inbounds i16, i16* %x, i32 48 3272 %40 = bitcast i16* %39 to <8 x i16>* 3273 %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2 3274 %41 = getelementptr inbounds i16, i16* %y, i32 48 3275 %42 = bitcast i16* %41 to <8 x i16>* 3276 %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2 3277 %43 = mul <8 x i16> %wide.load13.6, %wide.load.6 3278 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43) 3279 %45 = add i16 %44, %38 3280 %46 = getelementptr inbounds i16, i16* %x, i32 56 3281 %47 = bitcast i16* %46 to <8 x i16>* 3282 %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2 3283 %48 = getelementptr inbounds i16, i16* %y, i32 56 3284 %49 = bitcast i16* %48 to <8 x i16>* 3285 %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2 3286 %50 = mul <8 x i16> %wide.load13.7, %wide.load.7 3287 %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50) 3288 %52 = add i16 %51, %45 3289 ret i16 %52 3290} 3291 3292define signext i16 @mlav128i16i16(i16* %x, i16* %y) { 3293; CHECK-LABEL: mlav128i16i16: 3294; CHECK: @ %bb.0: @ %entry 3295; CHECK-NEXT: vldrh.u16 q0, [r0] 3296; CHECK-NEXT: vldrh.u16 q1, [r1] 3297; CHECK-NEXT: vmlav.u16 r2, q1, q0 3298; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 3299; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 3300; CHECK-NEXT: vmlava.u16 r2, q1, q0 3301; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 3302; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 3303; CHECK-NEXT: vmlava.u16 r2, q1, q0 3304; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 3305; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 3306; CHECK-NEXT: vmlava.u16 r2, q1, q0 3307; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 3308; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 3309; CHECK-NEXT: vmlava.u16 r2, q1, q0 3310; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 3311; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 3312; CHECK-NEXT: vmlava.u16 r2, q1, q0 3313; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 3314; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 3315; CHECK-NEXT: vmlava.u16 r2, q1, q0 3316; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 3317; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 3318; CHECK-NEXT: vmlava.u16 r2, q1, q0 3319; CHECK-NEXT: vldrh.u16 q0, [r0, #128] 3320; CHECK-NEXT: vldrh.u16 q1, [r1, #128] 3321; 
CHECK-NEXT: vmlava.u16 r2, q1, q0 3322; CHECK-NEXT: vldrh.u16 q0, [r0, #144] 3323; CHECK-NEXT: vldrh.u16 q1, [r1, #144] 3324; CHECK-NEXT: vmlava.u16 r2, q1, q0 3325; CHECK-NEXT: vldrh.u16 q0, [r0, #160] 3326; CHECK-NEXT: vldrh.u16 q1, [r1, #160] 3327; CHECK-NEXT: vmlava.u16 r2, q1, q0 3328; CHECK-NEXT: vldrh.u16 q0, [r0, #176] 3329; CHECK-NEXT: vldrh.u16 q1, [r1, #176] 3330; CHECK-NEXT: vmlava.u16 r2, q1, q0 3331; CHECK-NEXT: vldrh.u16 q0, [r0, #192] 3332; CHECK-NEXT: vldrh.u16 q1, [r1, #192] 3333; CHECK-NEXT: vmlava.u16 r2, q1, q0 3334; CHECK-NEXT: vldrh.u16 q0, [r0, #208] 3335; CHECK-NEXT: vldrh.u16 q1, [r1, #208] 3336; CHECK-NEXT: vmlava.u16 r2, q1, q0 3337; CHECK-NEXT: vldrh.u16 q0, [r0, #224] 3338; CHECK-NEXT: vldrh.u16 q1, [r1, #224] 3339; CHECK-NEXT: vmlava.u16 r2, q1, q0 3340; CHECK-NEXT: vldrh.u16 q0, [r0, #240] 3341; CHECK-NEXT: vldrh.u16 q1, [r1, #240] 3342; CHECK-NEXT: vmlava.u16 r2, q1, q0 3343; CHECK-NEXT: sxth r0, r2 3344; CHECK-NEXT: bx lr 3345entry: 3346 %0 = bitcast i16* %x to <8 x i16>* 3347 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2 3348 %1 = bitcast i16* %y to <8 x i16>* 3349 %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2 3350 %2 = mul <8 x i16> %wide.load13, %wide.load 3351 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) 3352 %4 = getelementptr inbounds i16, i16* %x, i32 8 3353 %5 = bitcast i16* %4 to <8 x i16>* 3354 %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2 3355 %6 = getelementptr inbounds i16, i16* %y, i32 8 3356 %7 = bitcast i16* %6 to <8 x i16>* 3357 %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2 3358 %8 = mul <8 x i16> %wide.load13.1, %wide.load.1 3359 %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8) 3360 %10 = add i16 %9, %3 3361 %11 = getelementptr inbounds i16, i16* %x, i32 16 3362 %12 = bitcast i16* %11 to <8 x i16>* 3363 %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2 3364 %13 = getelementptr inbounds i16, i16* %y, i32 16 3365 %14 = bitcast i16* %13 to <8 x i16>* 3366 %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2 3367 %15 = mul <8 x i16> %wide.load13.2, %wide.load.2 3368 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15) 3369 %17 = add i16 %16, %10 3370 %18 = getelementptr inbounds i16, i16* %x, i32 24 3371 %19 = bitcast i16* %18 to <8 x i16>* 3372 %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2 3373 %20 = getelementptr inbounds i16, i16* %y, i32 24 3374 %21 = bitcast i16* %20 to <8 x i16>* 3375 %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2 3376 %22 = mul <8 x i16> %wide.load13.3, %wide.load.3 3377 %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22) 3378 %24 = add i16 %23, %17 3379 %25 = getelementptr inbounds i16, i16* %x, i32 32 3380 %26 = bitcast i16* %25 to <8 x i16>* 3381 %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2 3382 %27 = getelementptr inbounds i16, i16* %y, i32 32 3383 %28 = bitcast i16* %27 to <8 x i16>* 3384 %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2 3385 %29 = mul <8 x i16> %wide.load13.4, %wide.load.4 3386 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29) 3387 %31 = add i16 %30, %24 3388 %32 = getelementptr inbounds i16, i16* %x, i32 40 3389 %33 = bitcast i16* %32 to <8 x i16>* 3390 %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2 3391 %34 = getelementptr inbounds i16, i16* %y, i32 40 3392 %35 = bitcast i16* %34 to <8 x i16>* 3393 %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2 3394 %36 = mul <8 x i16> %wide.load13.5, %wide.load.5 3395 %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x 
i16> %36) 3396 %38 = add i16 %37, %31 3397 %39 = getelementptr inbounds i16, i16* %x, i32 48 3398 %40 = bitcast i16* %39 to <8 x i16>* 3399 %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2 3400 %41 = getelementptr inbounds i16, i16* %y, i32 48 3401 %42 = bitcast i16* %41 to <8 x i16>* 3402 %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2 3403 %43 = mul <8 x i16> %wide.load13.6, %wide.load.6 3404 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43) 3405 %45 = add i16 %44, %38 3406 %46 = getelementptr inbounds i16, i16* %x, i32 56 3407 %47 = bitcast i16* %46 to <8 x i16>* 3408 %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2 3409 %48 = getelementptr inbounds i16, i16* %y, i32 56 3410 %49 = bitcast i16* %48 to <8 x i16>* 3411 %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2 3412 %50 = mul <8 x i16> %wide.load13.7, %wide.load.7 3413 %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50) 3414 %52 = add i16 %51, %45 3415 %53 = getelementptr inbounds i16, i16* %x, i32 64 3416 %54 = bitcast i16* %53 to <8 x i16>* 3417 %wide.load.8 = load <8 x i16>, <8 x i16>* %54, align 2 3418 %55 = getelementptr inbounds i16, i16* %y, i32 64 3419 %56 = bitcast i16* %55 to <8 x i16>* 3420 %wide.load13.8 = load <8 x i16>, <8 x i16>* %56, align 2 3421 %57 = mul <8 x i16> %wide.load13.8, %wide.load.8 3422 %58 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %57) 3423 %59 = add i16 %58, %52 3424 %60 = getelementptr inbounds i16, i16* %x, i32 72 3425 %61 = bitcast i16* %60 to <8 x i16>* 3426 %wide.load.9 = load <8 x i16>, <8 x i16>* %61, align 2 3427 %62 = getelementptr inbounds i16, i16* %y, i32 72 3428 %63 = bitcast i16* %62 to <8 x i16>* 3429 %wide.load13.9 = load <8 x i16>, <8 x i16>* %63, align 2 3430 %64 = mul <8 x i16> %wide.load13.9, %wide.load.9 3431 %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64) 3432 %66 = add i16 %65, %59 3433 %67 = getelementptr inbounds i16, i16* %x, i32 80 3434 %68 = bitcast i16* %67 to <8 x i16>* 3435 %wide.load.10 = load <8 x i16>, <8 x i16>* %68, align 2 3436 %69 = getelementptr inbounds i16, i16* %y, i32 80 3437 %70 = bitcast i16* %69 to <8 x i16>* 3438 %wide.load13.10 = load <8 x i16>, <8 x i16>* %70, align 2 3439 %71 = mul <8 x i16> %wide.load13.10, %wide.load.10 3440 %72 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %71) 3441 %73 = add i16 %72, %66 3442 %74 = getelementptr inbounds i16, i16* %x, i32 88 3443 %75 = bitcast i16* %74 to <8 x i16>* 3444 %wide.load.11 = load <8 x i16>, <8 x i16>* %75, align 2 3445 %76 = getelementptr inbounds i16, i16* %y, i32 88 3446 %77 = bitcast i16* %76 to <8 x i16>* 3447 %wide.load13.11 = load <8 x i16>, <8 x i16>* %77, align 2 3448 %78 = mul <8 x i16> %wide.load13.11, %wide.load.11 3449 %79 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %78) 3450 %80 = add i16 %79, %73 3451 %81 = getelementptr inbounds i16, i16* %x, i32 96 3452 %82 = bitcast i16* %81 to <8 x i16>* 3453 %wide.load.12 = load <8 x i16>, <8 x i16>* %82, align 2 3454 %83 = getelementptr inbounds i16, i16* %y, i32 96 3455 %84 = bitcast i16* %83 to <8 x i16>* 3456 %wide.load13.12 = load <8 x i16>, <8 x i16>* %84, align 2 3457 %85 = mul <8 x i16> %wide.load13.12, %wide.load.12 3458 %86 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %85) 3459 %87 = add i16 %86, %80 3460 %88 = getelementptr inbounds i16, i16* %x, i32 104 3461 %89 = bitcast i16* %88 to <8 x i16>* 3462 %wide.load.13 = load <8 x i16>, <8 x i16>* %89, align 2 3463 %90 = getelementptr inbounds i16, i16* %y, i32 104 3464 %91 = bitcast i16* %90 to <8 x i16>* 3465 
%wide.load13.13 = load <8 x i16>, <8 x i16>* %91, align 2 3466 %92 = mul <8 x i16> %wide.load13.13, %wide.load.13 3467 %93 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %92) 3468 %94 = add i16 %93, %87 3469 %95 = getelementptr inbounds i16, i16* %x, i32 112 3470 %96 = bitcast i16* %95 to <8 x i16>* 3471 %wide.load.14 = load <8 x i16>, <8 x i16>* %96, align 2 3472 %97 = getelementptr inbounds i16, i16* %y, i32 112 3473 %98 = bitcast i16* %97 to <8 x i16>* 3474 %wide.load13.14 = load <8 x i16>, <8 x i16>* %98, align 2 3475 %99 = mul <8 x i16> %wide.load13.14, %wide.load.14 3476 %100 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %99) 3477 %101 = add i16 %100, %94 3478 %102 = getelementptr inbounds i16, i16* %x, i32 120 3479 %103 = bitcast i16* %102 to <8 x i16>* 3480 %wide.load.15 = load <8 x i16>, <8 x i16>* %103, align 2 3481 %104 = getelementptr inbounds i16, i16* %y, i32 120 3482 %105 = bitcast i16* %104 to <8 x i16>* 3483 %wide.load13.15 = load <8 x i16>, <8 x i16>* %105, align 2 3484 %106 = mul <8 x i16> %wide.load13.15, %wide.load.15 3485 %107 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %106) 3486 %108 = add i16 %107, %101 3487 ret i16 %108 3488} 3489 3490define zeroext i8 @mlav2i8i8(i8* %x, i8* %y) { 3491; CHECK-LABEL: mlav2i8i8: 3492; CHECK: @ %bb.0: @ %entry 3493; CHECK-NEXT: ldrb r2, [r0] 3494; CHECK-NEXT: ldrb r3, [r1] 3495; CHECK-NEXT: ldrb r0, [r0, #1] 3496; CHECK-NEXT: ldrb r1, [r1, #1] 3497; CHECK-NEXT: muls r2, r3, r2 3498; CHECK-NEXT: mla r0, r1, r0, r2 3499; CHECK-NEXT: uxtb r0, r0 3500; CHECK-NEXT: bx lr 3501entry: 3502 %0 = load i8, i8* %x, align 1 3503 %1 = load i8, i8* %y, align 1 3504 %mul = mul i8 %1, %0 3505 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1 3506 %2 = load i8, i8* %arrayidx.1, align 1 3507 %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1 3508 %3 = load i8, i8* %arrayidx1.1, align 1 3509 %mul.1 = mul i8 %3, %2 3510 %add.1 = add i8 %mul.1, %mul 3511 ret i8 %add.1 3512} 3513 3514define zeroext i8 @mlav4i8i8(i8* %x, i8* %y) { 3515; CHECK-LABEL: mlav4i8i8: 3516; CHECK: @ %bb.0: @ %entry 3517; CHECK-NEXT: vldrb.u32 q0, [r0] 3518; CHECK-NEXT: vldrb.u32 q1, [r1] 3519; CHECK-NEXT: vmlav.u32 r0, q1, q0 3520; CHECK-NEXT: uxtb r0, r0 3521; CHECK-NEXT: bx lr 3522entry: 3523 %0 = bitcast i8* %x to <4 x i8>* 3524 %1 = load <4 x i8>, <4 x i8>* %0, align 1 3525 %2 = bitcast i8* %y to <4 x i8>* 3526 %3 = load <4 x i8>, <4 x i8>* %2, align 1 3527 %4 = mul <4 x i8> %3, %1 3528 %5 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %4) 3529 ret i8 %5 3530} 3531 3532define zeroext i8 @mlav8i8i8(i8* %x, i8* %y) { 3533; CHECK-LABEL: mlav8i8i8: 3534; CHECK: @ %bb.0: @ %entry 3535; CHECK-NEXT: vldrb.u16 q0, [r0] 3536; CHECK-NEXT: vldrb.u16 q1, [r1] 3537; CHECK-NEXT: vmlav.u16 r0, q1, q0 3538; CHECK-NEXT: uxtb r0, r0 3539; CHECK-NEXT: bx lr 3540entry: 3541 %0 = bitcast i8* %x to <8 x i8>* 3542 %1 = load <8 x i8>, <8 x i8>* %0, align 1 3543 %2 = bitcast i8* %y to <8 x i8>* 3544 %3 = load <8 x i8>, <8 x i8>* %2, align 1 3545 %4 = mul <8 x i8> %3, %1 3546 %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4) 3547 ret i8 %5 3548} 3549 3550define zeroext i8 @mlav16i8i8(i8* %x, i8* %y) { 3551; CHECK-LABEL: mlav16i8i8: 3552; CHECK: @ %bb.0: @ %entry 3553; CHECK-NEXT: vldrb.u8 q0, [r0] 3554; CHECK-NEXT: vldrb.u8 q1, [r1] 3555; CHECK-NEXT: vmlav.u8 r0, q1, q0 3556; CHECK-NEXT: uxtb r0, r0 3557; CHECK-NEXT: bx lr 3558entry: 3559 %0 = bitcast i8* %x to <16 x i8>* 3560 %1 = load <16 x i8>, <16 x i8>* %0, align 1 3561 %2 = bitcast i8* %y to <16 x i8>* 3562 %3 = 
load <16 x i8>, <16 x i8>* %2, align 1 3563 %4 = mul <16 x i8> %3, %1 3564 %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4) 3565 ret i8 %5 3566} 3567 3568define zeroext i8 @mlav24i8i8(i8* %x, i8* %y) { 3569; CHECK-LABEL: mlav24i8i8: 3570; CHECK: @ %bb.0: @ %entry 3571; CHECK-NEXT: vldrb.u16 q0, [r0] 3572; CHECK-NEXT: vldrb.u16 q1, [r1] 3573; CHECK-NEXT: vmlav.u16 r2, q1, q0 3574; CHECK-NEXT: vldrb.u8 q0, [r0, #8] 3575; CHECK-NEXT: vldrb.u8 q1, [r1, #8] 3576; CHECK-NEXT: vmlava.u8 r2, q1, q0 3577; CHECK-NEXT: uxtb r0, r2 3578; CHECK-NEXT: bx lr 3579entry: 3580 %0 = bitcast i8* %x to <8 x i8>* 3581 %1 = load <8 x i8>, <8 x i8>* %0, align 1 3582 %2 = bitcast i8* %y to <8 x i8>* 3583 %3 = load <8 x i8>, <8 x i8>* %2, align 1 3584 %4 = mul <8 x i8> %3, %1 3585 %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8 3586 %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8 3587 %5 = bitcast i8* %arrayidx.8 to <16 x i8>* 3588 %6 = load <16 x i8>, <16 x i8>* %5, align 1 3589 %7 = bitcast i8* %arrayidx1.8 to <16 x i8>* 3590 %8 = load <16 x i8>, <16 x i8>* %7, align 1 3591 %9 = mul <16 x i8> %8, %6 3592 %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9) 3593 %11 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4) 3594 %op.rdx = add i8 %10, %11 3595 ret i8 %op.rdx 3596} 3597 3598define zeroext i8 @mlav32i8i8(i8* %x, i8* %y) { 3599; CHECK-LABEL: mlav32i8i8: 3600; CHECK: @ %bb.0: @ %entry 3601; CHECK-NEXT: vldrb.u8 q0, [r0] 3602; CHECK-NEXT: vldrb.u8 q1, [r1] 3603; CHECK-NEXT: vmlav.u8 r2, q1, q0 3604; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 3605; CHECK-NEXT: vldrb.u8 q1, [r1, #16] 3606; CHECK-NEXT: vmlava.u8 r2, q1, q0 3607; CHECK-NEXT: uxtb r0, r2 3608; CHECK-NEXT: bx lr 3609entry: 3610 %0 = bitcast i8* %x to <32 x i8>* 3611 %1 = load <32 x i8>, <32 x i8>* %0, align 1 3612 %2 = bitcast i8* %y to <32 x i8>* 3613 %3 = load <32 x i8>, <32 x i8>* %2, align 1 3614 %4 = mul <32 x i8> %3, %1 3615 %5 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %4) 3616 ret i8 %5 3617} 3618 3619define zeroext i8 @mlav64i8i8(i8* %x, i8* %y) { 3620; CHECK-LABEL: mlav64i8i8: 3621; CHECK: @ %bb.0: @ %entry 3622; CHECK-NEXT: vldrb.u8 q0, [r0] 3623; CHECK-NEXT: vldrb.u8 q1, [r1] 3624; CHECK-NEXT: vmlav.u8 r2, q1, q0 3625; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 3626; CHECK-NEXT: vldrb.u8 q1, [r1, #16] 3627; CHECK-NEXT: vmlava.u8 r2, q1, q0 3628; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 3629; CHECK-NEXT: vldrb.u8 q1, [r1, #32] 3630; CHECK-NEXT: vmlava.u8 r2, q1, q0 3631; CHECK-NEXT: vldrb.u8 q0, [r0, #48] 3632; CHECK-NEXT: vldrb.u8 q1, [r1, #48] 3633; CHECK-NEXT: vmlava.u8 r2, q1, q0 3634; CHECK-NEXT: uxtb r0, r2 3635; CHECK-NEXT: bx lr 3636entry: 3637 %0 = bitcast i8* %x to <16 x i8>* 3638 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 3639 %1 = bitcast i8* %y to <16 x i8>* 3640 %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1 3641 %2 = mul <16 x i8> %wide.load12, %wide.load 3642 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) 3643 %4 = getelementptr inbounds i8, i8* %x, i32 16 3644 %5 = bitcast i8* %4 to <16 x i8>* 3645 %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1 3646 %6 = getelementptr inbounds i8, i8* %y, i32 16 3647 %7 = bitcast i8* %6 to <16 x i8>* 3648 %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1 3649 %8 = mul <16 x i8> %wide.load12.1, %wide.load.1 3650 %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8) 3651 %10 = add i8 %9, %3 3652 %11 = getelementptr inbounds i8, i8* %x, i32 32 3653 %12 = bitcast i8* %11 to <16 x i8>* 3654 %wide.load.2 = load <16 x i8>, <16 x i8>* %12, 
align 1 3655 %13 = getelementptr inbounds i8, i8* %y, i32 32 3656 %14 = bitcast i8* %13 to <16 x i8>* 3657 %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1 3658 %15 = mul <16 x i8> %wide.load12.2, %wide.load.2 3659 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15) 3660 %17 = add i8 %16, %10 3661 %18 = getelementptr inbounds i8, i8* %x, i32 48 3662 %19 = bitcast i8* %18 to <16 x i8>* 3663 %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1 3664 %20 = getelementptr inbounds i8, i8* %y, i32 48 3665 %21 = bitcast i8* %20 to <16 x i8>* 3666 %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1 3667 %22 = mul <16 x i8> %wide.load12.3, %wide.load.3 3668 %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %22) 3669 %24 = add i8 %23, %17 3670 ret i8 %24 3671} 3672 3673define zeroext i8 @mlav128i8i8(i8* %x, i8* %y) { 3674; CHECK-LABEL: mlav128i8i8: 3675; CHECK: @ %bb.0: @ %entry 3676; CHECK-NEXT: vldrb.u8 q0, [r0] 3677; CHECK-NEXT: vldrb.u8 q1, [r1] 3678; CHECK-NEXT: vmlav.u8 r2, q1, q0 3679; CHECK-NEXT: vldrb.u8 q0, [r0, #16] 3680; CHECK-NEXT: vldrb.u8 q1, [r1, #16] 3681; CHECK-NEXT: vmlava.u8 r2, q1, q0 3682; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 3683; CHECK-NEXT: vldrb.u8 q1, [r1, #32] 3684; CHECK-NEXT: vmlava.u8 r2, q1, q0 3685; CHECK-NEXT: vldrb.u8 q0, [r0, #48] 3686; CHECK-NEXT: vldrb.u8 q1, [r1, #48] 3687; CHECK-NEXT: vmlava.u8 r2, q1, q0 3688; CHECK-NEXT: vldrb.u8 q0, [r0, #64] 3689; CHECK-NEXT: vldrb.u8 q1, [r1, #64] 3690; CHECK-NEXT: vmlava.u8 r2, q1, q0 3691; CHECK-NEXT: vldrb.u8 q0, [r0, #80] 3692; CHECK-NEXT: vldrb.u8 q1, [r1, #80] 3693; CHECK-NEXT: vmlava.u8 r2, q1, q0 3694; CHECK-NEXT: vldrb.u8 q0, [r0, #96] 3695; CHECK-NEXT: vldrb.u8 q1, [r1, #96] 3696; CHECK-NEXT: vmlava.u8 r2, q1, q0 3697; CHECK-NEXT: vldrb.u8 q0, [r0, #112] 3698; CHECK-NEXT: vldrb.u8 q1, [r1, #112] 3699; CHECK-NEXT: vmlava.u8 r2, q1, q0 3700; CHECK-NEXT: uxtb r0, r2 3701; CHECK-NEXT: bx lr 3702entry: 3703 %0 = bitcast i8* %x to <16 x i8>* 3704 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1 3705 %1 = bitcast i8* %y to <16 x i8>* 3706 %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1 3707 %2 = mul <16 x i8> %wide.load12, %wide.load 3708 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) 3709 %4 = getelementptr inbounds i8, i8* %x, i32 16 3710 %5 = bitcast i8* %4 to <16 x i8>* 3711 %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1 3712 %6 = getelementptr inbounds i8, i8* %y, i32 16 3713 %7 = bitcast i8* %6 to <16 x i8>* 3714 %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1 3715 %8 = mul <16 x i8> %wide.load12.1, %wide.load.1 3716 %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8) 3717 %10 = add i8 %9, %3 3718 %11 = getelementptr inbounds i8, i8* %x, i32 32 3719 %12 = bitcast i8* %11 to <16 x i8>* 3720 %wide.load.2 = load <16 x i8>, <16 x i8>* %12, align 1 3721 %13 = getelementptr inbounds i8, i8* %y, i32 32 3722 %14 = bitcast i8* %13 to <16 x i8>* 3723 %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1 3724 %15 = mul <16 x i8> %wide.load12.2, %wide.load.2 3725 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15) 3726 %17 = add i8 %16, %10 3727 %18 = getelementptr inbounds i8, i8* %x, i32 48 3728 %19 = bitcast i8* %18 to <16 x i8>* 3729 %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1 3730 %20 = getelementptr inbounds i8, i8* %y, i32 48 3731 %21 = bitcast i8* %20 to <16 x i8>* 3732 %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1 3733 %22 = mul <16 x i8> %wide.load12.3, %wide.load.3 3734 %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x 
i8> %22)
  %24 = add i8 %23, %17
  %25 = getelementptr inbounds i8, i8* %x, i32 64
  %26 = bitcast i8* %25 to <16 x i8>*
  %wide.load.4 = load <16 x i8>, <16 x i8>* %26, align 1
  %27 = getelementptr inbounds i8, i8* %y, i32 64
  %28 = bitcast i8* %27 to <16 x i8>*
  %wide.load12.4 = load <16 x i8>, <16 x i8>* %28, align 1
  %29 = mul <16 x i8> %wide.load12.4, %wide.load.4
  %30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
  %31 = add i8 %30, %24
  %32 = getelementptr inbounds i8, i8* %x, i32 80
  %33 = bitcast i8* %32 to <16 x i8>*
  %wide.load.5 = load <16 x i8>, <16 x i8>* %33, align 1
  %34 = getelementptr inbounds i8, i8* %y, i32 80
  %35 = bitcast i8* %34 to <16 x i8>*
  %wide.load12.5 = load <16 x i8>, <16 x i8>* %35, align 1
  %36 = mul <16 x i8> %wide.load12.5, %wide.load.5
  %37 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %36)
  %38 = add i8 %37, %31
  %39 = getelementptr inbounds i8, i8* %x, i32 96
  %40 = bitcast i8* %39 to <16 x i8>*
  %wide.load.6 = load <16 x i8>, <16 x i8>* %40, align 1
  %41 = getelementptr inbounds i8, i8* %y, i32 96
  %42 = bitcast i8* %41 to <16 x i8>*
  %wide.load12.6 = load <16 x i8>, <16 x i8>* %42, align 1
  %43 = mul <16 x i8> %wide.load12.6, %wide.load.6
  %44 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %43)
  %45 = add i8 %44, %38
  %46 = getelementptr inbounds i8, i8* %x, i32 112
  %47 = bitcast i8* %46 to <16 x i8>*
  %wide.load.7 = load <16 x i8>, <16 x i8>* %47, align 1
  %48 = getelementptr inbounds i8, i8* %y, i32 112
  %49 = bitcast i8* %48 to <16 x i8>*
  %wide.load12.7 = load <16 x i8>, <16 x i8>* %49, align 1
  %50 = mul <16 x i8> %wide.load12.7, %wide.load.7
  %51 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %50)
  %52 = add i8 %51, %45
  ret i8 %52
}

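; The add_two_const tests below check that two vector.reduce.add calls whose
; results are combined through a scalar add chain (with constant operands
; mixed in) still lower to a single VADDV/VADDVA sequence, with the constants
; folded into one final scalar add.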
define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    adds r0, #10
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, %b
  %d = add i32 %c, 10
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    adds r0, #10
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %c, %b
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q1
; CHECK-NEXT:    adds r0, #20
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %b, 10
  %e = add i32 %c, %d
  ret i32 %e
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
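
; For reference, the multiply-accumulate reduction tests above correspond to a
; scalar dot-product loop over narrow inputs accumulated into a wider type.
; The sketch below is illustrative only (function and variable names are made
; up and not taken from the original source); it shows the i8 -> i32 shape
; that, once unrolled and SLP-vectorized, yields the zext/mul/reduce.add
; patterns checked in the mlav*i32i8 functions:
;
;   int mla_i8_to_i32(unsigned char *x, unsigned char *y, int n) {
;     int sum = 0;
;     for (int i = 0; i < n; i++)
;       sum += x[i] * y[i];   // zero-extended i8 products accumulated into i32
;     return sum;
;   }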