; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK

; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user specified vector length.

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; ADD
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: add_v8i8:
; CHECK: add v0.8b, v0.8b, v1.8b
; CHECK: ret
  %res = add <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: add_v16i8:
; CHECK: add v0.16b, v0.16b, v1.16b
; CHECK: ret
  %res = add <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: add_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = add <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: add_v64i8:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; CHECK: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = add <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: add_v128i8:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
; CHECK: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = add <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: add_v256i8:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[PG]]/m, [[OP1_4]].b, [[OP2_4]].b
; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[PG]]/m, [[OP1_5]].b, [[OP2_5]].b
; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[PG]]/m, [[OP1_6]].b, [[OP2_6]].b
; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[PG]]/m, [[OP1_7]].b, [[OP2_7]].b
; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
; CHECK: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = add <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: add_v4i16:
; CHECK: add v0.4h, v0.4h, v1.4h
; CHECK: ret
  %res = add <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: add_v8i16:
; CHECK: add v0.8h, v0.8h, v1.8h
; CHECK: ret
  %res = add <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: add_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = add <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: add_v32i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = add <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: add_v64i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = add <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: add_v128i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = add <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: add_v2i32:
; CHECK: add v0.2s, v0.2s, v1.2s
; CHECK: ret
  %res = add <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: add_v4i32:
; CHECK: add v0.4s, v0.4s, v1.4s
; CHECK: ret
  %res = add <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = add <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: add_v16i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = add <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = add <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = add <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: add_v1i64:
; CHECK: add d0, d0, d1
; CHECK: ret
  %res = add <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: add_v2i64:
; CHECK: add v0.2d, v0.2d, v1.2d
; CHECK: ret
  %res = add <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: add_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = add <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: add_v8i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = add <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: add_v16i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = add <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: add_v32i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = add <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}

;
; NOTE: Tests beyond this point only have CHECK lines to validate the first
; VBYTES because the add tests already validate the legalisation code paths.
;

;
; MUL
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: mul_v8i8:
; CHECK: mul v0.8b, v0.8b, v1.8b
; CHECK: ret
  %res = mul <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: mul_v16i8:
; CHECK: mul v0.16b, v0.16b, v1.16b
; CHECK: ret
  %res = mul <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: mul_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = mul <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: mul_v64i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = mul <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @mul_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: mul_v128i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = mul <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @mul_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: mul_v256i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = mul <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: mul_v4i16:
; CHECK: mul v0.4h, v0.4h, v1.4h
; CHECK: ret
  %res = mul <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: mul_v8i16:
; CHECK: mul v0.8h, v0.8h, v1.8h
; CHECK: ret
  %res = mul <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: mul_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = mul <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}

define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: mul_v32i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = mul <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}

define void @mul_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: mul_v64i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = mul <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}

define void @mul_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: mul_v128i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = mul <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: mul_v2i32:
; CHECK: mul v0.2s, v0.2s, v1.2s
; CHECK: ret
  %res = mul <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: mul_v4i32:
; CHECK: mul v0.4s, v0.4s, v1.4s
; CHECK: ret
  %res = mul <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: mul_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = mul <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: mul_v16i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = mul <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}

define void @mul_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: mul_v32i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = mul <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}

define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: mul_v64i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = mul <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = mul <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = mul <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: mul_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = mul <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}

define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: mul_v8i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = mul <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}

define void @mul_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: mul_v16i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = mul <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}

define void @mul_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: mul_v32i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = mul <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}

;
; SUB
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: sub_v8i8:
; CHECK: sub v0.8b, v0.8b, v1.8b
; CHECK: ret
  %res = sub <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: sub_v16i8:
; CHECK: sub v0.16b, v0.16b, v1.16b
; CHECK: ret
  %res = sub <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: sub_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = sub <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: sub_v64i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = sub <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: sub_v128i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = sub <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: sub_v256i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = sub <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: sub_v4i16:
; CHECK: sub v0.4h, v0.4h, v1.4h
; CHECK: ret
  %res = sub <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: sub_v8i16:
; CHECK: sub v0.8h, v0.8h, v1.8h
; CHECK: ret
  %res = sub <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

; Halfword elements: VBYTES/2 lanes per register, clamped to the element
; count of the fixed-length type.
define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: sub_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = sub <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}

define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: sub_v32i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = sub <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}

define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: sub_v64i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = sub <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}

define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: sub_v128i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = sub <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: sub_v2i32:
; CHECK: sub v0.2s, v0.2s, v1.2s
; CHECK: ret
  %res = sub <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: sub_v4i32:
; CHECK: sub v0.4s, v0.4s, v1.4s
; CHECK: ret
  %res = sub <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

; Word elements: VBYTES/4 lanes per register.
define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: sub_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = sub <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: sub_v16i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = sub <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}

define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: sub_v32i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = sub <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}

define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: sub_v64i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = sub <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: sub_v1i64:
; CHECK: sub d0, d0, d1
; CHECK: ret
  %res = sub <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
; NEON subtract is expected for the 128-bit case (unlike 64-bit-element
; multiply earlier in the file, which uses SVE even at 128 bits).
define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: sub_v2i64:
; CHECK: sub v0.2d, v0.2d, v1.2d
; CHECK: ret
  %res = sub <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

; Doubleword elements: VBYTES/8 lanes per register, clamped to the element
; count of the fixed-length type.
define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: sub_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = sub <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}

define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: sub_v8i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = sub <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}

define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: sub_v16i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = sub <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}

define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: sub_v32i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = sub <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}

; Every function above is compiled with SVE enabled via this attribute set.
attributes #0 = { "target-features"="+sve" }