; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: shadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: shadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: shadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: shadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: shadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: shadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %sextsrc1, %sextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %sextsrc1, %sextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %sextsrc1, %sextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %sextsrc1, %sextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}

define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %zextsrc1, %zextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %zextsrc1, %zextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %zextsrc1, %zextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}

declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone