; Codegen tests for AArch64 NEON absolute-difference instructions
; (sabd/uabd, widening sabdl/uabdl and their high-half "2" forms, fabd)
; and saturating absolute value (sqabs), run through both SelectionDAG
; and GlobalISel (with fallback remarks captured on stderr).
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=FALLBACK,CHECK,GISEL

; FALLBACK-NOT: remark:{{.*}} G_ZEXT
; FALLBACK-NOT: remark:{{.*}} sabdl8h
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl4s
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl2d
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; High-half variants: the shuffles extract the upper half of each input, so
; selection should still produce the base (non-"2") widening instruction
; operating on the extracted halves.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; Fixed: the three unsigned patterns below had a stray trailing ')'
; (e.g. "uabdl8h)"), which can never match a fallback remark and so made
; the negative checks vacuous; the signed variants above have no paren.
; FALLBACK-NOT: remark:{{.*}} uabdl8h
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl4s
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl2d
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; Reduction tests: an IR absolute-difference pattern (zext, sub, compare,
; negate, select) feeding vector.reduce.add should select the widening
; uabdl/uabdl2 pair.
declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)

define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_rdx
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}

declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)

define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_rdx
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)

; Fixed: the label directive below was a plain match rather than a label
; match like its two siblings above, which weakens the check boundaries.
; NOTE(review): the i32 %h parameter is unused — possibly left over; kept
; as-is so the tested function signature is unchanged.
define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabdl2d_rdx
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

; Floating-point absolute difference via the target intrinsic.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

; Floating-point absolute difference written as fsub + llvm.fabs should
; also select the fused fabd instruction.
define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s_from_fsub_fabs:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %sub = fsub <2 x float> %tmp1, %tmp2
  %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
  ret <2 x float> %abs
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s_from_fsub_fabs:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %sub = fsub <4 x float> %tmp1, %tmp2
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
  ret <4 x float> %abs
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d_from_fsub_fabs:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %sub = fsub <2 x double> %tmp1, %tmp2
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
  ret <2 x double> %abs
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone

; Non-widening integer absolute difference, signed.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Non-widening integer absolute difference, unsigned.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Saturating absolute value.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

; Saturating negation.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
492declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone 493declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone 494 495define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind { 496;CHECK-LABEL: abs_8b: 497;CHECK: abs.8b 498 %tmp1 = load <8 x i8>, <8 x i8>* %A 499 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1) 500 ret <8 x i8> %tmp3 501} 502 503define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind { 504;CHECK-LABEL: abs_16b: 505;CHECK: abs.16b 506 %tmp1 = load <16 x i8>, <16 x i8>* %A 507 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1) 508 ret <16 x i8> %tmp3 509} 510 511define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind { 512;CHECK-LABEL: abs_4h: 513;CHECK: abs.4h 514 %tmp1 = load <4 x i16>, <4 x i16>* %A 515 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1) 516 ret <4 x i16> %tmp3 517} 518 519define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind { 520;CHECK-LABEL: abs_8h: 521;CHECK: abs.8h 522 %tmp1 = load <8 x i16>, <8 x i16>* %A 523 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1) 524 ret <8 x i16> %tmp3 525} 526 527define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind { 528;CHECK-LABEL: abs_2s: 529;CHECK: abs.2s 530 %tmp1 = load <2 x i32>, <2 x i32>* %A 531 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1) 532 ret <2 x i32> %tmp3 533} 534 535define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind { 536;CHECK-LABEL: abs_4s: 537;CHECK: abs.4s 538 %tmp1 = load <4 x i32>, <4 x i32>* %A 539 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1) 540 ret <4 x i32> %tmp3 541} 542 543define <1 x i64> @abs_1d(<1 x i64> %A) nounwind { 544; CHECK-LABEL: abs_1d: 545; CHECK: abs d0, d0 546 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A) 547 ret <1 x i64> %abs 548} 549 550define i64 @abs_1d_honestly(i64 %A) nounwind { 551; CHECK-LABEL: abs_1d_honestly: 552; CHECK: abs d0, d0 553 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A) 
554 ret i64 %abs 555} 556 557declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone 558declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone 559declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone 560declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone 561declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone 562declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone 563declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone 564declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone 565 566; FALLBACK-NOT: remark:{{.*}} sabal8h 567define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { 568;CHECK-LABEL: sabal8h: 569;CHECK: sabal.8h 570 %tmp1 = load <8 x i8>, <8 x i8>* %A 571 %tmp2 = load <8 x i8>, <8 x i8>* %B 572 %tmp3 = load <8 x i16>, <8 x i16>* %C 573 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 574 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 575 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 576 ret <8 x i16> %tmp5 577} 578 579; FALLBACK-NOT: remark:{{.*}} sabal4s 580define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 581;CHECK-LABEL: sabal4s: 582;CHECK: sabal.4s 583 %tmp1 = load <4 x i16>, <4 x i16>* %A 584 %tmp2 = load <4 x i16>, <4 x i16>* %B 585 %tmp3 = load <4 x i32>, <4 x i32>* %C 586 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 587 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 588 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 589 ret <4 x i32> %tmp5 590} 591 592; FALLBACK-NOT: remark:{{.*}} sabal2d 593define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 594;CHECK-LABEL: sabal2d: 595;CHECK: sabal.2d 596 %tmp1 = load <2 x i32>, <2 x i32>* %A 597 %tmp2 = load <2 x i32>, <2 x i32>* %B 598 %tmp3 = load <2 x i64>, <2 x i64>* %C 599 %tmp4 = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 600 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 601 %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64> 602 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 603 ret <2 x i64> %tmp5 604} 605 606define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { 607;CHECK-LABEL: sabal2_8h: 608;CHECK: sabal.8h 609 %load1 = load <16 x i8>, <16 x i8>* %A 610 %load2 = load <16 x i8>, <16 x i8>* %B 611 %tmp3 = load <8 x i16>, <8 x i16>* %C 612 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 613 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 614 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 615 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 616 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 617 ret <8 x i16> %tmp5 618} 619 620define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 621;CHECK-LABEL: sabal2_4s: 622;CHECK: sabal.4s 623 %load1 = load <8 x i16>, <8 x i16>* %A 624 %load2 = load <8 x i16>, <8 x i16>* %B 625 %tmp3 = load <4 x i32>, <4 x i32>* %C 626 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 627 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 628 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 629 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 630 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 631 ret <4 x i32> %tmp5 632} 633 634define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 635;CHECK-LABEL: sabal2_2d: 636;CHECK: sabal.2d 637 %load1 = load <4 x i32>, <4 x i32>* %A 638 %load2 = load <4 x i32>, <4 x i32>* %B 639 %tmp3 = load <2 x i64>, <2 x i64>* %C 640 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 
641 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 642 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 643 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 644 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 645 ret <2 x i64> %tmp5 646} 647 648; FALLBACK-NOT: remark:{{.*}} uabal8h 649define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { 650;CHECK-LABEL: uabal8h: 651;CHECK: uabal.8h 652 %tmp1 = load <8 x i8>, <8 x i8>* %A 653 %tmp2 = load <8 x i8>, <8 x i8>* %B 654 %tmp3 = load <8 x i16>, <8 x i16>* %C 655 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 656 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 657 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 658 ret <8 x i16> %tmp5 659} 660 661; FALLBACK-NOT: remark:{{.*}} uabal8s 662define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 663;CHECK-LABEL: uabal4s: 664;CHECK: uabal.4s 665 %tmp1 = load <4 x i16>, <4 x i16>* %A 666 %tmp2 = load <4 x i16>, <4 x i16>* %B 667 %tmp3 = load <4 x i32>, <4 x i32>* %C 668 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 669 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 670 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 671 ret <4 x i32> %tmp5 672} 673 674; FALLBACK-NOT: remark:{{.*}} uabal2d 675define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 676;CHECK-LABEL: uabal2d: 677;CHECK: uabal.2d 678 %tmp1 = load <2 x i32>, <2 x i32>* %A 679 %tmp2 = load <2 x i32>, <2 x i32>* %B 680 %tmp3 = load <2 x i64>, <2 x i64>* %C 681 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 682 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 683 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 684 ret <2 x i64> %tmp5 685} 686 687define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { 688;CHECK-LABEL: uabal2_8h: 689;CHECK: uabal.8h 690 %load1 = load <16 x i8>, <16 x i8>* %A 
691 %load2 = load <16 x i8>, <16 x i8>* %B 692 %tmp3 = load <8 x i16>, <8 x i16>* %C 693 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 694 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 695 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 696 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> 697 %tmp5 = add <8 x i16> %tmp3, %tmp4.1 698 ret <8 x i16> %tmp5 699} 700 701define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 702;CHECK-LABEL: uabal2_4s: 703;CHECK: uabal.4s 704 %load1 = load <8 x i16>, <8 x i16>* %A 705 %load2 = load <8 x i16>, <8 x i16>* %B 706 %tmp3 = load <4 x i32>, <4 x i32>* %C 707 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 708 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 709 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 710 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> 711 %tmp5 = add <4 x i32> %tmp3, %tmp4.1 712 ret <4 x i32> %tmp5 713} 714 715define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 716;CHECK-LABEL: uabal2_2d: 717;CHECK: uabal.2d 718 %load1 = load <4 x i32>, <4 x i32>* %A 719 %load2 = load <4 x i32>, <4 x i32>* %B 720 %tmp3 = load <2 x i64>, <2 x i64>* %C 721 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 722 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 723 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 724 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> 725 %tmp5 = add <2 x i64> %tmp3, %tmp4.1 726 ret <2 x i64> %tmp5 727} 728 729define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { 730;CHECK-LABEL: saba_8b: 
;CHECK: saba.8b
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
 %tmp4 = load <8 x i8>, <8 x i8>* %C
 %tmp5 = add <8 x i8> %tmp3, %tmp4
 ret <8 x i8> %tmp5
}

; SABA at the remaining widths: sabd intrinsic result added to the value
; loaded from %C; each should select a single accumulating saba.
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
 %tmp4 = load <16 x i8>, <16 x i8>* %C
 %tmp5 = add <16 x i8> %tmp3, %tmp4
 ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
 %tmp4 = load <4 x i16>, <4 x i16>* %C
 %tmp5 = add <4 x i16> %tmp3, %tmp4
 ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
 %tmp4 = load <8 x i16>, <8 x i16>* %C
 %tmp5 = add <8 x i16> %tmp3, %tmp4
 ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
 %tmp4 = load <2 x i32>, <2 x i32>* %C
 %tmp5 = add <2 x i32> %tmp3, %tmp4
 ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
 %tmp4 = load <4 x i32>, <4 x i32>* %C
 %tmp5 = add <4 x i32> %tmp3, %tmp4
 ret <4 x i32> %tmp5
}

; UABA: unsigned counterpart of the SABA tests above (uabd intrinsic + add).
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
 %tmp4 = load <8 x i8>, <8 x i8>* %C
 %tmp5 = add <8 x i8> %tmp3, %tmp4
 ret <8 x i8> %tmp5
}

define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
 %tmp4 = load <16 x i8>, <16 x i8>* %C
 %tmp5 = add <16 x i8> %tmp3, %tmp4
 ret <16 x i8> %tmp5
}

define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
 %tmp4 = load <4 x i16>, <4 x i16>* %C
 %tmp5 = add <4 x i16> %tmp3, %tmp4
 ret <4 x i16> %tmp5
}

define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
 %tmp4 = load <8 x i16>, <8 x i16>* %C
 %tmp5 = add <8 x i16> %tmp3, %tmp4
 ret <8 x i16> %tmp5
}

define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
 %tmp4 = load <2 x i32>, <2 x i32>* %C
 %tmp5 = add <2 x i32> %tmp3, %tmp4
 ret <2 x i32> %tmp5
}

define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
 %tmp4 = load <4 x i32>, <4 x i32>* %C
 %tmp5 = add <4 x i32> %tmp3, %tmp4
 ret <4 x i32> %tmp5
}

; Scalar FABD
; Floating-point absolute difference via the aarch64.sisd.fabd intrinsic.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
 ret float %vabd.i
}

define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
 ret double %vabd.i
}

declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone

; fsub followed by fabs should also select a scalar fabd.
define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
; CHECK-LABEL: fabds_from_fsub_fabs:
; CHECK: fabd s0, s0, s1
 %sub = fsub float %a, %b
 %abs = tail call float @llvm.fabs.f32(float %sub)
 ret float %abs
}

define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd_from_fsub_fabs:
; CHECK: fabd d0, d0, d1
 %sub = fsub double %a, %b
 %abs = tail call double @llvm.fabs.f64(double %sub)
 ret double %abs
}

declare float @llvm.fabs.f32(float) nounwind readnone
declare double @llvm.fabs.f64(double) nounwind readnone

; Widening abs-difference fed by a dup'd scalar and a half extract: the
; extract/dup must fold into [us]abdl/[us]abdl2 directly, with no separate
; ext.16b (enforced by the CHECK-NOT lines).
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl.2d
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
 %res1 = zext <2 x i32> %res to <2 x i64>
 ret <2 x i64> %res1
}

define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
 %res1 = zext <2 x i32> %res to <2 x i64>
 ret <2 x i64> %res1
}

define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl.2d
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
 %res1 = zext <2 x i32> %res to <2 x i64>
 ret <2 x i64> %res1
}

define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
 %res1 = zext <2 x i32> %res to <2 x i64>
 ret <2 x i64> %res1
}

; Integer abs built from sub/icmp/select. The DAG path selects a single abs;
; the GISEL lines record the current GlobalISel expansion (compare, sub,
; per-element select) reached via the -global-isel-abort=2 fallback RUN line.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; DAG: abs.2s
; DAG-NEXT: ret

; GISEL: cmge.2s
; GISEL: sub.2s
; GISEL: fcsel
; GISEL: fcsel
 %tmp1neg = sub <2 x i32> zeroinitializer, %a
 %b = icmp sge <2 x i32> %a, zeroinitializer
 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
 ret <2 x i32> %abs
}

define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
 %tmp1neg = sub <4 x i16> zeroinitializer, %a
 %b = icmp sgt <4 x i16> %a, zeroinitializer
 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
 ret <4 x i16> %abs
}

; slt with the select operands swapped - still the abs idiom.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
 %tmp1neg = sub <8 x i8> zeroinitializer, %a
 %b = icmp slt <8 x i8> %a, zeroinitializer
 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
 ret <8 x i8> %abs
}

define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; DAG: abs.4s
; DAG-NEXT: ret

; GISEL: cmge.4s
; GISEL: fcsel
; GISEL: fcsel
; GISEL: fcsel
; GISEL: fcsel
 %tmp1neg = sub <4 x i32> zeroinitializer, %a
 %b = icmp sge <4 x i32> %a, zeroinitializer
 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
 ret <4 x i32> %abs
}

define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; DAG: abs.8h
; DAG-NEXT: ret

; GISEL: cmgt.8h
; GISEL: sub.8h
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
 ; abs(x) = x > 0 ? x : 0 - x
 %tmp1neg = sub <8 x i16> zeroinitializer, %a
 %b = icmp sgt <8 x i16> %a, zeroinitializer
 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
 ret <8 x i16> %abs
}

; slt variant with the select operands swapped - same abs idiom.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
 %tmp1neg = sub <16 x i8> zeroinitializer, %a
 %b = icmp slt <16 x i8> %a, zeroinitializer
 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
 ret <16 x i8> %abs
}

; sle variant at 64-bit elements; the DAG path still selects a single abs.2d,
; the GISEL lines record the fallback expansion.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; DAG: abs.2d
; DAG-NEXT: ret

; GISEL: cmge.2d
; GISEL: sub.2d
; GISEL: fcsel
; GISEL: fcsel
 %tmp1neg = sub <2 x i64> zeroinitializer, %a
 %b = icmp sle <2 x i64> %a, zeroinitializer
 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
 ret <2 x i64> %abs
}