; Fixed-length vector fadd tests for AArch64 SVE: checks that vectors wider
; than 128 bits are lowered to predicated SVE fadd across a range of
; -aarch64-sve-vector-bits-min values, while 64/128-bit vectors keep using NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK

; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user specified vector length.

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

; Don't use SVE for 64-bit vectors.
define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: @fadd_v4f16
; CHECK: fadd v0.4h, v0.4h, v1.4h
; CHECK: ret
  %res = fadd <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: @fadd_v8f16
; CHECK: fadd v0.8h, v0.8h, v1.8h
; CHECK: ret
  %res = fadd <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v16f16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = fadd <16 x half> %op1, %op2
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v32f16
; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
; VBITS_LE_256-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
; VBITS_LE_256-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
; VBITS_LE_256-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
; VBITS_LE_256-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
; CHECK: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = fadd <32 x half> %op1, %op2
  store <32 x half> %res, <32 x half>* %a
  ret void
}

define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v64f16
; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
; VBITS_LE_512-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
; VBITS_LE_512-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
; VBITS_LE_512-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
; VBITS_LE_512-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
; VBITS_LE_512-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: add x[[B2:[0-9]+]], x1, #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: ld1h { [[OP1_2:z[0-9]+]].h }, [[PG]]/z, [x[[A2]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_2:z[0-9]+]].h }, [[PG]]/z, [x[[B2]]]
; VBITS_LE_256-DAG: fadd [[RES_2:z[0-9]+]].h, [[PG]]/m, [[OP1_2]].h, [[OP2_2]].h
; VBITS_LE_256-DAG: st1h { [[RES_2]].h }, [[PG]], [x[[A2]]]
; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: add x[[B3:[0-9]+]], x1, #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: ld1h { [[OP1_3:z[0-9]+]].h }, [[PG]]/z, [x[[A3]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_3:z[0-9]+]].h }, [[PG]]/z, [x[[B3]]]
; VBITS_LE_256-DAG: fadd [[RES_3:z[0-9]+]].h, [[PG]]/m, [[OP1_3]].h, [[OP2_3]].h
; VBITS_LE_256-DAG: st1h { [[RES_3]].h }, [[PG]], [x[[A3]]]
; CHECK: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = fadd <64 x half> %op1, %op2
  store <64 x half> %res, <64 x half>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v128f16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = fadd <128 x half> %op1, %op2
  store <128 x half> %res, <128 x half>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: @fadd_v2f32
; CHECK: fadd v0.2s, v0.2s, v1.2s
; CHECK: ret
  %res = fadd <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: @fadd_v4f32
; CHECK: fadd v0.4s, v0.4s, v1.4s
; CHECK: ret
  %res = fadd <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v8f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = fadd <8 x float> %op1, %op2
  store <8 x float> %res, <8 x float>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v16f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = fadd <16 x float> %op1, %op2
  store <16 x float> %res, <16 x float>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v32f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = fadd <32 x float> %op1, %op2
  store <32 x float> %res, <32 x float>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v64f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = fadd <64 x float> %op1, %op2
  store <64 x float> %res, <64 x float>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: @fadd_v1f64
; CHECK: fadd d0, d0, d1
; CHECK: ret
  %res = fadd <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: @fadd_v2f64
; CHECK: fadd v0.2d, v0.2d, v1.2d
; CHECK: ret
  %res = fadd <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v4f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = fadd <4 x double> %op1, %op2
  store <4 x double> %res, <4 x double>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v8f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = fadd <8 x double> %op1, %op2
  store <8 x double> %res, <8 x double>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v16f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = fadd <16 x double> %op1, %op2
  store <16 x double> %res, <16 x double>* %a
  ret void
}

; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v32f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = fadd <32 x double> %op1, %op2
  store <32 x double> %res, <32 x double>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }