; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK

; FALLBACK-NOT: remark:{{.*}} sabdl8h
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabdl.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl4s
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabdl.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl2d
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabdl.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl2_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: sabdl.8h v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl2_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: sabdl.4s v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: sabdl.2d v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl8h)
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabdl.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl4s)
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabdl.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl2d)
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabdl.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl2_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: uabdl.8h v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl2_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: uabdl.4s v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1, #8]
; CHECK-NEXT: uabdl.2d v0, v0, v1
; CHECK-NEXT: ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabd16b_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.16b v0, v0, v1
; CHECK-NEXT: ushll.8h v1, v0, #0
; CHECK-NEXT: uaddw2.8h v0, v1, v0
; CHECK-NEXT: addv.8h h0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}

define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uabdl.8h v2, v0, v1
; CHECK-NEXT: uabal2.8h v2, v0, v1
; CHECK-NEXT: uaddlv.8h s0, v2
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %aext = zext <16 x i8> %a to <16 x i32>
  %bext = zext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sabdl.8h v2, v0, v1
; CHECK-NEXT: sabal2.8h v2, v0, v1
; CHECK-NEXT: uaddlv.8h s0, v2
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %aext = sext <16 x i8> %a to <16 x i32>
  %bext = sext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}


declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabd8h_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.8h v0, v0, v1
; CHECK-NEXT: ushll.4s v1, v0, #0
; CHECK-NEXT: uaddw2.4s v0, v1, v0
; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: sabd8h_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: sabd.8h v0, v0, v1
; CHECK-NEXT: ushll.4s v1, v0, #0
; CHECK-NEXT: uaddw2.4s v0, v1, v0
; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %aext = sext <8 x i16> %a to <8 x i32>
  %bext = sext <8 x i16> %b to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
; DAG-LABEL: uabdl4s_rdx_i32:
; DAG: // %bb.0:
; DAG-NEXT: uabdl.4s v0, v0, v1
; DAG-NEXT: addv.4s s0, v0
; DAG-NEXT: fmov w0, s0
; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl4s_rdx_i32:
; GISEL: // %bb.0:
; GISEL-NEXT: movi.2d v2, #0000000000000000
; GISEL-NEXT: usubl.4s v0, v0, v1
; GISEL-NEXT: cmgt.4s v1, v2, v0
; GISEL-NEXT: shl.4s v1, v1, #31
; GISEL-NEXT: neg.4s v2, v0
; GISEL-NEXT: sshr.4s v1, v1, #31
; GISEL-NEXT: bit.16b v0, v2, v1
; GISEL-NEXT: addv.4s s0, v0
; GISEL-NEXT: fmov w0, s0
; GISEL-NEXT: ret

; GISel doesn't match this pattern yet.
  %aext = zext <4 x i16> %a to <4 x i32>
  %bext = zext <4 x i16> %b to <4 x i32>
  %abdiff = sub nsw <4 x i32> %aext, %bext
  %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
  ret i32 %reduced_v
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)

define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabd4s_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.4s v0, v0, v1
; CHECK-NEXT: ushll.2d v1, v0, #0
; CHECK-NEXT: uaddw2.2d v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: sabd4s_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: sabd.4s v0, v0, v1
; CHECK-NEXT: ushll.2d v1, v0, #0
; CHECK-NEXT: uaddw2.2d v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
; DAG-LABEL: uabdl2d_rdx_i64:
; DAG: // %bb.0:
; DAG-NEXT: uabdl.2d v0, v0, v1
; DAG-NEXT: addp.2d d0, v0
; DAG-NEXT: fmov x0, d0
; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl2d_rdx_i64:
; GISEL: // %bb.0:
; GISEL-NEXT: movi.2d v2, #0000000000000000
; GISEL-NEXT: usubl.2d v0, v0, v1
; GISEL-NEXT: cmgt.2d v1, v2, v0
; GISEL-NEXT: shl.2d v1, v1, #63
; GISEL-NEXT: neg.2d v2, v0
; GISEL-NEXT: sshr.2d v1, v1, #63
; GISEL-NEXT: bit.16b v0, v2, v1
; GISEL-NEXT: addp.2d d0, v0
; GISEL-NEXT: fmov x0, d0
; GISEL-NEXT: ret

; GISel doesn't match this pattern yet
  %aext = zext <2 x i32> %a to <2 x i64>
  %bext = zext <2 x i32> %b to <2 x i64>
  %abdiff = sub nsw <2 x i64> %aext, %bext
  %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
  ret i64 %reduced_v
}

define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fabd_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: fabd.2s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fabd_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fabd.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fabd_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fabd.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fabd_2s_from_fsub_fabs:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: fabd.2s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %sub = fsub <2 x float> %tmp1, %tmp2
  %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
  ret <2 x float> %abs
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fabd_4s_from_fsub_fabs:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fabd.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %sub = fsub <4 x float> %tmp1, %tmp2
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
  ret <4 x float> %abs
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fabd_2d_from_fsub_fabs:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fabd.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %sub = fsub <2 x double> %tmp1, %tmp2
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
  ret <2 x double> %abs
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone

define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabd.8b v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sabd.16b v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabd.4h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sabd.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sabd.2s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sabd.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabd.8b v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.16b v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabd.4h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabd.2s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqabs.8b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqabs.16b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqabs.4h v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqabs.8h v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqabs.2s v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqabs.4s v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqneg.8b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqneg.16b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqneg.4h v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqneg.8h v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: sqneg.2s v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sqneg.4s v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: abs_8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: abs.8b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: abs_16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: abs.16b v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: abs_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: abs.4h v0, v0
; CHECK-NEXT: ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: abs_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: abs.8h v0, v0
; 
CHECK-NEXT: ret 853 %tmp1 = load <8 x i16>, <8 x i16>* %A 854 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1) 855 ret <8 x i16> %tmp3 856} 857 858define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind { 859; CHECK-LABEL: abs_2s: 860; CHECK: // %bb.0: 861; CHECK-NEXT: ldr d0, [x0] 862; CHECK-NEXT: abs.2s v0, v0 863; CHECK-NEXT: ret 864 %tmp1 = load <2 x i32>, <2 x i32>* %A 865 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1) 866 ret <2 x i32> %tmp3 867} 868 869define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind { 870; CHECK-LABEL: abs_4s: 871; CHECK: // %bb.0: 872; CHECK-NEXT: ldr q0, [x0] 873; CHECK-NEXT: abs.4s v0, v0 874; CHECK-NEXT: ret 875 %tmp1 = load <4 x i32>, <4 x i32>* %A 876 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1) 877 ret <4 x i32> %tmp3 878} 879 880define <1 x i64> @abs_1d(<1 x i64> %A) nounwind { 881; CHECK-LABEL: abs_1d: 882; CHECK: // %bb.0: 883; CHECK-NEXT: abs d0, d0 884; CHECK-NEXT: ret 885 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A) 886 ret <1 x i64> %abs 887} 888 889define i64 @abs_1d_honestly(i64 %A) nounwind { 890; CHECK-LABEL: abs_1d_honestly: 891; CHECK: // %bb.0: 892; CHECK-NEXT: fmov d0, x0 893; CHECK-NEXT: abs d0, d0 894; CHECK-NEXT: fmov x0, d0 895; CHECK-NEXT: ret 896 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A) 897 ret i64 %abs 898} 899 900declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone 901declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone 902declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone 903declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone 904declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone 905declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone 906declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone 907declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone 908 909; 
FALLBACK-NOT: remark:{{.*}} sabal8h 910define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { 911; CHECK-LABEL: sabal8h: 912; CHECK: // %bb.0: 913; CHECK-NEXT: ldr d1, [x0] 914; CHECK-NEXT: ldr d2, [x1] 915; CHECK-NEXT: ldr q0, [x2] 916; CHECK-NEXT: sabal.8h v0, v1, v2 917; CHECK-NEXT: ret 918 %tmp1 = load <8 x i8>, <8 x i8>* %A 919 %tmp2 = load <8 x i8>, <8 x i8>* %B 920 %tmp3 = load <8 x i16>, <8 x i16>* %C 921 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 922 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 923 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 924 ret <8 x i16> %tmp5 925} 926 927; FALLBACK-NOT: remark:{{.*}} sabal4s 928define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 929; CHECK-LABEL: sabal4s: 930; CHECK: // %bb.0: 931; CHECK-NEXT: ldr d1, [x0] 932; CHECK-NEXT: ldr d2, [x1] 933; CHECK-NEXT: ldr q0, [x2] 934; CHECK-NEXT: sabal.4s v0, v1, v2 935; CHECK-NEXT: ret 936 %tmp1 = load <4 x i16>, <4 x i16>* %A 937 %tmp2 = load <4 x i16>, <4 x i16>* %B 938 %tmp3 = load <4 x i32>, <4 x i32>* %C 939 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 940 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 941 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 942 ret <4 x i32> %tmp5 943} 944 945; FALLBACK-NOT: remark:{{.*}} sabal2d 946define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 947; CHECK-LABEL: sabal2d: 948; CHECK: // %bb.0: 949; CHECK-NEXT: ldr d1, [x0] 950; CHECK-NEXT: ldr d2, [x1] 951; CHECK-NEXT: ldr q0, [x2] 952; CHECK-NEXT: sabal.2d v0, v1, v2 953; CHECK-NEXT: ret 954 %tmp1 = load <2 x i32>, <2 x i32>* %A 955 %tmp2 = load <2 x i32>, <2 x i32>* %B 956 %tmp3 = load <2 x i64>, <2 x i64>* %C 957 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 958 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 960 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 961 ret <2 x
i64> %tmp5 962} 963 964define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { 965; CHECK-LABEL: sabal2_8h: 966; CHECK: // %bb.0: 967; CHECK-NEXT: ldr q0, [x2] 968; CHECK-NEXT: ldr d1, [x0, #8] 969; CHECK-NEXT: ldr d2, [x1, #8] 970; CHECK-NEXT: sabal.8h v0, v1, v2 971; CHECK-NEXT: ret 972 %load1 = load <16 x i8>, <16 x i8>* %A 973 %load2 = load <16 x i8>, <16 x i8>* %B 974 %tmp3 = load <8 x i16>, <8 x i16>* %C 975 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 976 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 977 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 978 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 979 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 980 ret <8 x i16> %tmp5 981} 982 983define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 984; CHECK-LABEL: sabal2_4s: 985; CHECK: // %bb.0: 986; CHECK-NEXT: ldr q0, [x2] 987; CHECK-NEXT: ldr d1, [x0, #8] 988; CHECK-NEXT: ldr d2, [x1, #8] 989; CHECK-NEXT: sabal.4s v0, v1, v2 990; CHECK-NEXT: ret 991 %load1 = load <8 x i16>, <8 x i16>* %A 992 %load2 = load <8 x i16>, <8 x i16>* %B 993 %tmp3 = load <4 x i32>, <4 x i32>* %C 994 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 995 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 996 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 997 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 998 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 999 ret <4 x i32> %tmp5 1000} 1001 1002define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 1003; CHECK-LABEL: sabal2_2d: 1004; CHECK: // %bb.0: 1005; CHECK-NEXT: ldr q0, [x2] 1006; CHECK-NEXT: ldr d1, [x0, #8] 1007; CHECK-NEXT: ldr d2, [x1, #8] 
1008; CHECK-NEXT: sabal.2d v0, v1, v2 1009; CHECK-NEXT: ret 1010 %load1 = load <4 x i32>, <4 x i32>* %A 1011 %load2 = load <4 x i32>, <4 x i32>* %B 1012 %tmp3 = load <2 x i64>, <2 x i64>* %C 1013 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1014 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1015 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 1016 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 1017 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 1018 ret <2 x i64> %tmp5 1019} 1020 1021; FALLBACK-NOT: remark:{{.*}} uabal8h 1022define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { 1023; CHECK-LABEL: uabal8h: 1024; CHECK: // %bb.0: 1025; CHECK-NEXT: ldr d1, [x0] 1026; CHECK-NEXT: ldr d2, [x1] 1027; CHECK-NEXT: ldr q0, [x2] 1028; CHECK-NEXT: uabal.8h v0, v1, v2 1029; CHECK-NEXT: ret 1030 %tmp1 = load <8 x i8>, <8 x i8>* %A 1031 %tmp2 = load <8 x i8>, <8 x i8>* %B 1032 %tmp3 = load <8 x i16>, <8 x i16>* %C 1033 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 1034 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 1035 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 1036 ret <8 x i16> %tmp5 1037} 1038 1039; FALLBACK-NOT: remark:{{.*}} uabal4s 1040define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 1041; CHECK-LABEL: uabal4s: 1042; CHECK: // %bb.0: 1043; CHECK-NEXT: ldr d1, [x0] 1044; CHECK-NEXT: ldr d2, [x1] 1045; CHECK-NEXT: ldr q0, [x2] 1046; CHECK-NEXT: uabal.4s v0, v1, v2 1047; CHECK-NEXT: ret 1048 %tmp1 = load <4 x i16>, <4 x i16>* %A 1049 %tmp2 = load <4 x i16>, <4 x i16>* %B 1050 %tmp3 = load <4 x i32>, <4 x i32>* %C 1051 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 1052 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 1053 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 1054 ret <4 x i32> %tmp5 1055} 1056 1057; FALLBACK-NOT: remark:{{.*}} uabal2d 1058define <2 x i64>
@uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 1059; CHECK-LABEL: uabal2d: 1060; CHECK: // %bb.0: 1061; CHECK-NEXT: ldr d1, [x0] 1062; CHECK-NEXT: ldr d2, [x1] 1063; CHECK-NEXT: ldr q0, [x2] 1064; CHECK-NEXT: uabal.2d v0, v1, v2 1065; CHECK-NEXT: ret 1066 %tmp1 = load <2 x i32>, <2 x i32>* %A 1067 %tmp2 = load <2 x i32>, <2 x i32>* %B 1068 %tmp3 = load <2 x i64>, <2 x i64>* %C 1069 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 1070 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 1071 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 1072 ret <2 x i64> %tmp5 1073} 1074 1075define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { 1076; CHECK-LABEL: uabal2_8h: 1077; CHECK: // %bb.0: 1078; CHECK-NEXT: ldr q0, [x2] 1079; CHECK-NEXT: ldr d1, [x0, #8] 1080; CHECK-NEXT: ldr d2, [x1, #8] 1081; CHECK-NEXT: uabal.8h v0, v1, v2 1082; CHECK-NEXT: ret 1083 %load1 = load <16 x i8>, <16 x i8>* %A 1084 %load2 = load <16 x i8>, <16 x i8>* %B 1085 %tmp3 = load <8 x i16>, <8 x i16>* %C 1086 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1087 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1088 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 1089 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 1090 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 1091 ret <8 x i16> %tmp5 1092} 1093 1094define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 1095; CHECK-LABEL: uabal2_4s: 1096; CHECK: // %bb.0: 1097; CHECK-NEXT: ldr q0, [x2] 1098; CHECK-NEXT: ldr d1, [x0, #8] 1099; CHECK-NEXT: ldr d2, [x1, #8] 1100; CHECK-NEXT: uabal.4s v0, v1, v2 1101; CHECK-NEXT: ret 1102 %load1 = load <8 x i16>, <8 x i16>* %A 1103 %load2 = load <8 x i16>, <8 x i16>* %B 1104 %tmp3 = load <4 x i32>, <4 x i32>* %C 1105 %tmp1 = shufflevector <8 x 
i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1106 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1107 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 1108 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 1109 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 1110 ret <4 x i32> %tmp5 1111} 1112 1113define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 1114; CHECK-LABEL: uabal2_2d: 1115; CHECK: // %bb.0: 1116; CHECK-NEXT: ldr q0, [x2] 1117; CHECK-NEXT: ldr d1, [x0, #8] 1118; CHECK-NEXT: ldr d2, [x1, #8] 1119; CHECK-NEXT: uabal.2d v0, v1, v2 1120; CHECK-NEXT: ret 1121 %load1 = load <4 x i32>, <4 x i32>* %A 1122 %load2 = load <4 x i32>, <4 x i32>* %B 1123 %tmp3 = load <2 x i64>, <2 x i64>* %C 1124 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1125 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1126 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 1127 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 1128 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 1129 ret <2 x i64> %tmp5 1130} 1131 1132define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { 1133; CHECK-LABEL: saba_8b: 1134; CHECK: // %bb.0: 1135; CHECK-NEXT: ldr d1, [x0] 1136; CHECK-NEXT: ldr d2, [x1] 1137; CHECK-NEXT: ldr d0, [x2] 1138; CHECK-NEXT: saba.8b v0, v1, v2 1139; CHECK-NEXT: ret 1140 %tmp1 = load <8 x i8>, <8 x i8>* %A 1141 %tmp2 = load <8 x i8>, <8 x i8>* %B 1142 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 1143 %tmp4 = load <8 x i8>, <8 x i8>* %C 1144 %tmp5 = add <8 x i8> %tmp3, %tmp4 1145 ret <8 x i8> %tmp5 1146} 1147 1148define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { 1149; CHECK-LABEL: saba_16b: 1150; CHECK: // %bb.0: 1151; CHECK-NEXT: ldr q1, [x0] 1152; CHECK-NEXT: ldr q2, [x1] 1153; 
CHECK-NEXT: ldr q0, [x2] 1154; CHECK-NEXT: saba.16b v0, v1, v2 1155; CHECK-NEXT: ret 1156 %tmp1 = load <16 x i8>, <16 x i8>* %A 1157 %tmp2 = load <16 x i8>, <16 x i8>* %B 1158 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) 1159 %tmp4 = load <16 x i8>, <16 x i8>* %C 1160 %tmp5 = add <16 x i8> %tmp3, %tmp4 1161 ret <16 x i8> %tmp5 1162} 1163 1164define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 1165; CHECK-LABEL: saba_4h: 1166; CHECK: // %bb.0: 1167; CHECK-NEXT: ldr d1, [x0] 1168; CHECK-NEXT: ldr d2, [x1] 1169; CHECK-NEXT: ldr d0, [x2] 1170; CHECK-NEXT: saba.4h v0, v1, v2 1171; CHECK-NEXT: ret 1172 %tmp1 = load <4 x i16>, <4 x i16>* %A 1173 %tmp2 = load <4 x i16>, <4 x i16>* %B 1174 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 1175 %tmp4 = load <4 x i16>, <4 x i16>* %C 1176 %tmp5 = add <4 x i16> %tmp3, %tmp4 1177 ret <4 x i16> %tmp5 1178} 1179 1180define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { 1181; CHECK-LABEL: saba_8h: 1182; CHECK: // %bb.0: 1183; CHECK-NEXT: ldr q1, [x0] 1184; CHECK-NEXT: ldr q2, [x1] 1185; CHECK-NEXT: ldr q0, [x2] 1186; CHECK-NEXT: saba.8h v0, v1, v2 1187; CHECK-NEXT: ret 1188 %tmp1 = load <8 x i16>, <8 x i16>* %A 1189 %tmp2 = load <8 x i16>, <8 x i16>* %B 1190 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 1191 %tmp4 = load <8 x i16>, <8 x i16>* %C 1192 %tmp5 = add <8 x i16> %tmp3, %tmp4 1193 ret <8 x i16> %tmp5 1194} 1195 1196define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 1197; CHECK-LABEL: saba_2s: 1198; CHECK: // %bb.0: 1199; CHECK-NEXT: ldr d1, [x0] 1200; CHECK-NEXT: ldr d2, [x1] 1201; CHECK-NEXT: ldr d0, [x2] 1202; CHECK-NEXT: saba.2s v0, v1, v2 1203; CHECK-NEXT: ret 1204 %tmp1 = load <2 x i32>, <2 x i32>* %A 1205 %tmp2 = load <2 x i32>, <2 x i32>* %B 1206 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> 
%tmp1, <2 x i32> %tmp2) 1207 %tmp4 = load <2 x i32>, <2 x i32>* %C 1208 %tmp5 = add <2 x i32> %tmp3, %tmp4 1209 ret <2 x i32> %tmp5 1210} 1211 1212define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { 1213; CHECK-LABEL: saba_4s: 1214; CHECK: // %bb.0: 1215; CHECK-NEXT: ldr q1, [x0] 1216; CHECK-NEXT: ldr q2, [x1] 1217; CHECK-NEXT: ldr q0, [x2] 1218; CHECK-NEXT: saba.4s v0, v1, v2 1219; CHECK-NEXT: ret 1220 %tmp1 = load <4 x i32>, <4 x i32>* %A 1221 %tmp2 = load <4 x i32>, <4 x i32>* %B 1222 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 1223 %tmp4 = load <4 x i32>, <4 x i32>* %C 1224 %tmp5 = add <4 x i32> %tmp3, %tmp4 1225 ret <4 x i32> %tmp5 1226} 1227 1228define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { 1229; CHECK-LABEL: uaba_8b: 1230; CHECK: // %bb.0: 1231; CHECK-NEXT: ldr d1, [x0] 1232; CHECK-NEXT: ldr d2, [x1] 1233; CHECK-NEXT: ldr d0, [x2] 1234; CHECK-NEXT: uaba.8b v0, v1, v2 1235; CHECK-NEXT: ret 1236 %tmp1 = load <8 x i8>, <8 x i8>* %A 1237 %tmp2 = load <8 x i8>, <8 x i8>* %B 1238 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 1239 %tmp4 = load <8 x i8>, <8 x i8>* %C 1240 %tmp5 = add <8 x i8> %tmp3, %tmp4 1241 ret <8 x i8> %tmp5 1242} 1243 1244define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { 1245; CHECK-LABEL: uaba_16b: 1246; CHECK: // %bb.0: 1247; CHECK-NEXT: ldr q1, [x0] 1248; CHECK-NEXT: ldr q2, [x1] 1249; CHECK-NEXT: ldr q0, [x2] 1250; CHECK-NEXT: uaba.16b v0, v1, v2 1251; CHECK-NEXT: ret 1252 %tmp1 = load <16 x i8>, <16 x i8>* %A 1253 %tmp2 = load <16 x i8>, <16 x i8>* %B 1254 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) 1255 %tmp4 = load <16 x i8>, <16 x i8>* %C 1256 %tmp5 = add <16 x i8> %tmp3, %tmp4 1257 ret <16 x i8> %tmp5 1258} 1259 1260define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 1261; CHECK-LABEL: 
uaba_4h: 1262; CHECK: // %bb.0: 1263; CHECK-NEXT: ldr d1, [x0] 1264; CHECK-NEXT: ldr d2, [x1] 1265; CHECK-NEXT: ldr d0, [x2] 1266; CHECK-NEXT: uaba.4h v0, v1, v2 1267; CHECK-NEXT: ret 1268 %tmp1 = load <4 x i16>, <4 x i16>* %A 1269 %tmp2 = load <4 x i16>, <4 x i16>* %B 1270 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 1271 %tmp4 = load <4 x i16>, <4 x i16>* %C 1272 %tmp5 = add <4 x i16> %tmp3, %tmp4 1273 ret <4 x i16> %tmp5 1274} 1275 1276define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { 1277; CHECK-LABEL: uaba_8h: 1278; CHECK: // %bb.0: 1279; CHECK-NEXT: ldr q1, [x0] 1280; CHECK-NEXT: ldr q2, [x1] 1281; CHECK-NEXT: ldr q0, [x2] 1282; CHECK-NEXT: uaba.8h v0, v1, v2 1283; CHECK-NEXT: ret 1284 %tmp1 = load <8 x i16>, <8 x i16>* %A 1285 %tmp2 = load <8 x i16>, <8 x i16>* %B 1286 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 1287 %tmp4 = load <8 x i16>, <8 x i16>* %C 1288 %tmp5 = add <8 x i16> %tmp3, %tmp4 1289 ret <8 x i16> %tmp5 1290} 1291 1292define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 1293; CHECK-LABEL: uaba_2s: 1294; CHECK: // %bb.0: 1295; CHECK-NEXT: ldr d1, [x0] 1296; CHECK-NEXT: ldr d2, [x1] 1297; CHECK-NEXT: ldr d0, [x2] 1298; CHECK-NEXT: uaba.2s v0, v1, v2 1299; CHECK-NEXT: ret 1300 %tmp1 = load <2 x i32>, <2 x i32>* %A 1301 %tmp2 = load <2 x i32>, <2 x i32>* %B 1302 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 1303 %tmp4 = load <2 x i32>, <2 x i32>* %C 1304 %tmp5 = add <2 x i32> %tmp3, %tmp4 1305 ret <2 x i32> %tmp5 1306} 1307 1308define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { 1309; CHECK-LABEL: uaba_4s: 1310; CHECK: // %bb.0: 1311; CHECK-NEXT: ldr q1, [x0] 1312; CHECK-NEXT: ldr q2, [x1] 1313; CHECK-NEXT: ldr q0, [x2] 1314; CHECK-NEXT: uaba.4s v0, v1, v2 1315; CHECK-NEXT: ret 1316 %tmp1 = load <4 x i32>, <4 x i32>* %A 1317 %tmp2 = 
load <4 x i32>, <4 x i32>* %B 1318 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 1319 %tmp4 = load <4 x i32>, <4 x i32>* %C 1320 %tmp5 = add <4 x i32> %tmp3, %tmp4 1321 ret <4 x i32> %tmp5 1322} 1323 1324; Scalar FABD 1325define float @fabds(float %a, float %b) nounwind { 1326; CHECK-LABEL: fabds: 1327; CHECK: // %bb.0: 1328; CHECK-NEXT: fabd s0, s0, s1 1329; CHECK-NEXT: ret 1330 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind 1331 ret float %vabd.i 1332} 1333 1334define double @fabdd(double %a, double %b) nounwind { 1335; CHECK-LABEL: fabdd: 1336; CHECK: // %bb.0: 1337; CHECK-NEXT: fabd d0, d0, d1 1338; CHECK-NEXT: ret 1339 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind 1340 ret double %vabd.i 1341} 1342 1343declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone 1344declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone 1345 1346define float @fabds_from_fsub_fabs(float %a, float %b) nounwind { 1347; CHECK-LABEL: fabds_from_fsub_fabs: 1348; CHECK: // %bb.0: 1349; CHECK-NEXT: fabd s0, s0, s1 1350; CHECK-NEXT: ret 1351 %sub = fsub float %a, %b 1352 %abs = tail call float @llvm.fabs.f32(float %sub) 1353 ret float %abs 1354} 1355 1356define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind { 1357; CHECK-LABEL: fabdd_from_fsub_fabs: 1358; CHECK: // %bb.0: 1359; CHECK-NEXT: fabd d0, d0, d1 1360; CHECK-NEXT: ret 1361 %sub = fsub double %a, %b 1362 %abs = tail call double @llvm.fabs.f64(double %sub) 1363 ret double %abs 1364} 1365 1366declare float @llvm.fabs.f32(float) nounwind readnone 1367declare double @llvm.fabs.f64(double) nounwind readnone 1368 1369define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1370; CHECK-LABEL: uabdl_from_extract_dup: 1371; CHECK: // %bb.0: 1372; CHECK-NEXT: dup.2s v1, w0 1373; CHECK-NEXT: uabdl.2d v0, v0, v1 1374; CHECK-NEXT: ret 1375 %rhsvec.tmp = insertelement 
<2 x i32> undef, i32 %rhs, i32 0 1376 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1377 1378 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1379 1380 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1381 %res1 = zext <2 x i32> %res to <2 x i64> 1382 ret <2 x i64> %res1 1383} 1384 1385define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1386; CHECK-LABEL: uabdl2_from_extract_dup: 1387; CHECK: // %bb.0: 1388; CHECK-NEXT: dup.4s v1, w0 1389; CHECK-NEXT: uabdl2.2d v0, v0, v1 1390; CHECK-NEXT: ret 1391 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1392 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1393 1394 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1395 1396 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1397 %res1 = zext <2 x i32> %res to <2 x i64> 1398 ret <2 x i64> %res1 1399} 1400 1401define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1402; CHECK-LABEL: sabdl_from_extract_dup: 1403; CHECK: // %bb.0: 1404; CHECK-NEXT: dup.2s v1, w0 1405; CHECK-NEXT: sabdl.2d v0, v0, v1 1406; CHECK-NEXT: ret 1407 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1408 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1409 1410 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1411 1412 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1413 %res1 = zext <2 x i32> %res to <2 x i64> 1414 ret <2 x i64> %res1 1415} 1416 1417define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1418; CHECK-LABEL: sabdl2_from_extract_dup: 1419; CHECK: // %bb.0: 1420; CHECK-NEXT: dup.4s v1, w0 1421; CHECK-NEXT: sabdl2.2d v0, v0, v1 1422; CHECK-NEXT: ret 1423 %rhsvec.tmp = insertelement <2 x i32> undef, i32 
%rhs, i32 0 1424 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1425 1426 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1427 1428 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1429 %res1 = zext <2 x i32> %res to <2 x i64> 1430 ret <2 x i64> %res1 1431} 1432 1433define <2 x i32> @abspattern1(<2 x i32> %a) nounwind { 1434; DAG-LABEL: abspattern1: 1435; DAG: // %bb.0: 1436; DAG-NEXT: abs.2s v0, v0 1437; DAG-NEXT: ret 1438; 1439; GISEL-LABEL: abspattern1: 1440; GISEL: // %bb.0: 1441; GISEL-NEXT: movi.2d v1, #0000000000000000 1442; GISEL-NEXT: cmge.2s v1, v0, v1 1443; GISEL-NEXT: shl.2s v1, v1, #31 1444; GISEL-NEXT: neg.2s v2, v0 1445; GISEL-NEXT: sshr.2s v1, v1, #31 1446; GISEL-NEXT: bif.8b v0, v2, v1 1447; GISEL-NEXT: ret 1448 1449 %tmp1neg = sub <2 x i32> zeroinitializer, %a 1450 %b = icmp sge <2 x i32> %a, zeroinitializer 1451 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg 1452 ret <2 x i32> %abs 1453} 1454 1455define <4 x i16> @abspattern2(<4 x i16> %a) nounwind { 1456; DAG-LABEL: abspattern2: 1457; DAG: // %bb.0: 1458; DAG-NEXT: abs.4h v0, v0 1459; DAG-NEXT: ret 1460; 1461; GISEL-LABEL: abspattern2: 1462; GISEL: // %bb.0: 1463; GISEL-NEXT: movi.2d v1, #0000000000000000 1464; GISEL-NEXT: cmgt.4h v1, v0, v1 1465; GISEL-NEXT: shl.4h v1, v1, #15 1466; GISEL-NEXT: neg.4h v2, v0 1467; GISEL-NEXT: sshr.4h v1, v1, #15 1468; GISEL-NEXT: bif.8b v0, v2, v1 1469; GISEL-NEXT: ret 1470; For GlobalISel, this generates terrible code until we can pattern match this to abs. 
1471 1472 %tmp1neg = sub <4 x i16> zeroinitializer, %a 1473 %b = icmp sgt <4 x i16> %a, zeroinitializer 1474 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg 1475 ret <4 x i16> %abs 1476} 1477 1478define <8 x i8> @abspattern3(<8 x i8> %a) nounwind { 1479; DAG-LABEL: abspattern3: 1480; DAG: // %bb.0: 1481; DAG-NEXT: abs.8b v0, v0 1482; DAG-NEXT: ret 1483; 1484; GISEL-LABEL: abspattern3: 1485; GISEL: // %bb.0: 1486; GISEL-NEXT: movi.2d v1, #0000000000000000 1487; GISEL-NEXT: cmgt.8b v1, v1, v0 1488; GISEL-NEXT: shl.8b v1, v1, #7 1489; GISEL-NEXT: neg.8b v2, v0 1490; GISEL-NEXT: sshr.8b v1, v1, #7 1491; GISEL-NEXT: bit.8b v0, v2, v1 1492; GISEL-NEXT: ret 1493 1494 %tmp1neg = sub <8 x i8> zeroinitializer, %a 1495 %b = icmp slt <8 x i8> %a, zeroinitializer 1496 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a 1497 ret <8 x i8> %abs 1498} 1499 1500define <4 x i32> @abspattern4(<4 x i32> %a) nounwind { 1501; DAG-LABEL: abspattern4: 1502; DAG: // %bb.0: 1503; DAG-NEXT: abs.4s v0, v0 1504; DAG-NEXT: ret 1505; 1506; GISEL-LABEL: abspattern4: 1507; GISEL: // %bb.0: 1508; GISEL-NEXT: movi.2d v1, #0000000000000000 1509; GISEL-NEXT: cmge.4s v1, v0, v1 1510; GISEL-NEXT: shl.4s v1, v1, #31 1511; GISEL-NEXT: neg.4s v2, v0 1512; GISEL-NEXT: sshr.4s v1, v1, #31 1513; GISEL-NEXT: bif.16b v0, v2, v1 1514; GISEL-NEXT: ret 1515 1516 %tmp1neg = sub <4 x i32> zeroinitializer, %a 1517 %b = icmp sge <4 x i32> %a, zeroinitializer 1518 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg 1519 ret <4 x i32> %abs 1520} 1521 1522define <8 x i16> @abspattern5(<8 x i16> %a) nounwind { 1523; DAG-LABEL: abspattern5: 1524; DAG: // %bb.0: 1525; DAG-NEXT: abs.8h v0, v0 1526; DAG-NEXT: ret 1527; 1528; GISEL-LABEL: abspattern5: 1529; GISEL: // %bb.0: 1530; GISEL-NEXT: movi.2d v1, #0000000000000000 1531; GISEL-NEXT: cmgt.8h v1, v0, v1 1532; GISEL-NEXT: shl.8h v1, v1, #15 1533; GISEL-NEXT: neg.8h v2, v0 1534; GISEL-NEXT: sshr.8h v1, v1, #15 1535; GISEL-NEXT: bif.16b v0, v2, v1 
1536; GISEL-NEXT: ret 1537 1538 %tmp1neg = sub <8 x i16> zeroinitializer, %a 1539 %b = icmp sgt <8 x i16> %a, zeroinitializer 1540 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg 1541 ret <8 x i16> %abs 1542} 1543 1544define <16 x i8> @abspattern6(<16 x i8> %a) nounwind { 1545; DAG-LABEL: abspattern6: 1546; DAG: // %bb.0: 1547; DAG-NEXT: abs.16b v0, v0 1548; DAG-NEXT: ret 1549; 1550; GISEL-LABEL: abspattern6: 1551; GISEL: // %bb.0: 1552; GISEL-NEXT: movi.2d v1, #0000000000000000 1553; GISEL-NEXT: cmgt.16b v1, v1, v0 1554; GISEL-NEXT: shl.16b v1, v1, #7 1555; GISEL-NEXT: neg.16b v2, v0 1556; GISEL-NEXT: sshr.16b v1, v1, #7 1557; GISEL-NEXT: bit.16b v0, v2, v1 1558; GISEL-NEXT: ret 1559 1560 %tmp1neg = sub <16 x i8> zeroinitializer, %a 1561 %b = icmp slt <16 x i8> %a, zeroinitializer 1562 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a 1563 ret <16 x i8> %abs 1564} 1565 1566define <2 x i64> @abspattern7(<2 x i64> %a) nounwind { 1567; DAG-LABEL: abspattern7: 1568; DAG: // %bb.0: 1569; DAG-NEXT: abs.2d v0, v0 1570; DAG-NEXT: ret 1571; 1572; GISEL-LABEL: abspattern7: 1573; GISEL: // %bb.0: 1574; GISEL-NEXT: movi.2d v1, #0000000000000000 1575; GISEL-NEXT: cmge.2d v1, v1, v0 1576; GISEL-NEXT: shl.2d v1, v1, #63 1577; GISEL-NEXT: neg.2d v2, v0 1578; GISEL-NEXT: sshr.2d v1, v1, #63 1579; GISEL-NEXT: bit.16b v0, v2, v1 1580; GISEL-NEXT: ret 1581 1582 %tmp1neg = sub <2 x i64> zeroinitializer, %a 1583 %b = icmp sle <2 x i64> %a, zeroinitializer 1584 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a 1585 ret <2 x i64> %abs 1586} 1587 1588define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) { 1589; DAG-LABEL: uabd_i32: 1590; DAG: // %bb.0: 1591; DAG-NEXT: sabdl.2d v0, v0, v1 1592; DAG-NEXT: ret 1593; 1594; GISEL-LABEL: uabd_i32: 1595; GISEL: // %bb.0: 1596; GISEL-NEXT: movi.2d v2, #0000000000000000 1597; GISEL-NEXT: ssubl.2d v0, v0, v1 1598; GISEL-NEXT: cmgt.2d v1, v2, v0 1599; GISEL-NEXT: shl.2d v1, v1, #63 1600; GISEL-NEXT: neg.2d v2, v0 
1601; GISEL-NEXT: sshr.2d v1, v1, #63 1602; GISEL-NEXT: bit.16b v0, v2, v1 1603; GISEL-NEXT: ret 1604 %aext = sext <2 x i32> %a to <2 x i64> 1605 %bext = sext <2 x i32> %b to <2 x i64> 1606 %abdiff = sub nsw <2 x i64> %aext, %bext 1607 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer 1608 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff 1609 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff 1610 ret <2 x i64> %absel 1611} 1612 1613 1614define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) { 1615; CHECK-LABEL: uabd_i64: 1616; CHECK: // %bb.0: 1617; CHECK-NEXT: fmov x9, d0 1618; CHECK-NEXT: fmov x12, d1 1619; CHECK-NEXT: asr x10, x9, #63 1620; CHECK-NEXT: asr x13, x12, #63 1621; CHECK-NEXT: subs x9, x9, x12 1622; CHECK-NEXT: mov.d x8, v0[1] 1623; CHECK-NEXT: mov.d x11, v1[1] 1624; CHECK-NEXT: sbcs x10, x10, x13 1625; CHECK-NEXT: asr x12, x8, #63 1626; CHECK-NEXT: asr x14, x11, #63 1627; CHECK-NEXT: subs x8, x8, x11 1628; CHECK-NEXT: sbcs x11, x12, x14 1629; CHECK-NEXT: negs x12, x8 1630; CHECK-NEXT: ngcs x13, x11 1631; CHECK-NEXT: cmp x11, #0 // =0 1632; CHECK-NEXT: csel x2, x12, x8, lt 1633; CHECK-NEXT: csel x3, x13, x11, lt 1634; CHECK-NEXT: negs x8, x9 1635; CHECK-NEXT: ngcs x11, x10 1636; CHECK-NEXT: cmp x10, #0 // =0 1637; CHECK-NEXT: csel x8, x8, x9, lt 1638; CHECK-NEXT: csel x1, x11, x10, lt 1639; CHECK-NEXT: fmov d0, x8 1640; CHECK-NEXT: mov.d v0[1], x1 1641; CHECK-NEXT: fmov x0, d0 1642; CHECK-NEXT: ret 1643 %aext = sext <2 x i64> %a to <2 x i128> 1644 %bext = sext <2 x i64> %b to <2 x i128> 1645 %abdiff = sub nsw <2 x i128> %aext, %bext 1646 %abcmp = icmp slt <2 x i128> %abdiff, zeroinitializer 1647 %ababs = sub nsw <2 x i128> zeroinitializer, %abdiff 1648 %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff 1649 ret <2 x i128> %absel 1650} 1651