1; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
2; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
3; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
4; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
5; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
6; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
7; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
8; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
9; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
10; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
11; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
12; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
13; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
14; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
15; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
16; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
17
18; VBYTES represents the useful byte size of a vector register from the code
19; generator's point of view. It is clamped to power-of-2 values because
20; only power-of-2 vector lengths are considered legal, regardless of the
21; user specified vector length.
22
; Compile for AArch64 Linux; SVE itself is enabled per-function via attribute #0.
target triple = "aarch64-unknown-linux-gnu"
24
25; Don't use SVE when its registers are no bigger than NEON.
26; NO_SVE-NOT: ptrue
27
28; Don't use SVE for 64-bit vectors.
define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: @fadd_v4f16
; CHECK: fadd v0.4h, v0.4h, v1.4h
; CHECK: ret
; A 64-bit vector is expected to stay in NEON: a single fadd on v-registers.
  %res = fadd <4 x half> %op1, %op2
  ret <4 x half> %res
}
36
37; Don't use SVE for 128-bit vectors.
define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: @fadd_v8f16
; CHECK: fadd v0.8h, v0.8h, v1.8h
; CHECK: ret
; A 128-bit vector also stays in NEON (matches a full q-register).
  %res = fadd <8 x half> %op1, %op2
  ret <8 x half> %res
}
45
define void @fadd_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v16f16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
; First size that uses SVE: predicated load/fadd/store, with the predicate
; limited to min(VBYTES/2, 16) halfword elements.
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = fadd <16 x half> %op1, %op2
  store <16 x half> %res, <16 x half>* %a
  ret void
}
60
define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v32f16
; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
; For VBITS_LE_256 the 64-byte operand spans two registers, so a second
; chunk is processed at byte offset VBYTES from each pointer:
; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
; VBITS_LE_256-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
; VBITS_LE_256-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
; VBITS_LE_256-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
; VBITS_LE_256-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
; CHECK: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = fadd <32 x half> %op1, %op2
  store <32 x half> %res, <32 x half>* %a
  ret void
}
81
define void @fadd_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v64f16
; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-DAG: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_LE_512 configurations need a second register-sized chunk at byte
; offset VBYTES:
; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
; VBITS_LE_512-DAG: add x[[B1:[0-9]+]], x1, #[[#VBYTES]]
; VBITS_LE_512-DAG: ld1h { [[OP1_1:z[0-9]+]].h }, [[PG]]/z, [x[[A1]]]
; VBITS_LE_512-DAG: ld1h { [[OP2_1:z[0-9]+]].h }, [[PG]]/z, [x[[B1]]]
; VBITS_LE_512-DAG: fadd [[RES_1:z[0-9]+]].h, [[PG]]/m, [[OP1_1]].h, [[OP2_1]].h
; VBITS_LE_512-DAG: st1h { [[RES_1]].h }, [[PG]], [x[[A1]]]
; VBITS_LE_256 configurations additionally need third and fourth chunks at
; offsets 2*VBYTES and 3*VBYTES:
; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: add x[[B2:[0-9]+]], x1, #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: ld1h { [[OP1_2:z[0-9]+]].h }, [[PG]]/z, [x[[A2]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_2:z[0-9]+]].h }, [[PG]]/z, [x[[B2]]]
; VBITS_LE_256-DAG: fadd [[RES_2:z[0-9]+]].h, [[PG]]/m, [[OP1_2]].h, [[OP2_2]].h
; VBITS_LE_256-DAG: st1h { [[RES_2]].h }, [[PG]], [x[[A2]]]
; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: add x[[B3:[0-9]+]], x1, #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: ld1h { [[OP1_3:z[0-9]+]].h }, [[PG]]/z, [x[[A3]]]
; VBITS_LE_256-DAG: ld1h { [[OP2_3:z[0-9]+]].h }, [[PG]]/z, [x[[B3]]]
; VBITS_LE_256-DAG: fadd [[RES_3:z[0-9]+]].h, [[PG]]/m, [[OP1_3]].h, [[OP2_3]].h
; VBITS_LE_256-DAG: st1h { [[RES_3]].h }, [[PG]], [x[[A3]]]
; CHECK: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = fadd <64 x half> %op1, %op2
  store <64 x half> %res, <64 x half>* %a
  ret void
}
114
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: @fadd_v128f16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = fadd <128 x half> %op1, %op2
  store <128 x half> %res, <128 x half>* %a
  ret void
}
131
132; Don't use SVE for 64-bit vectors.
define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: @fadd_v2f32
; CHECK: fadd v0.2s, v0.2s, v1.2s
; CHECK: ret
; 64-bit float vector: expected to remain a NEON fadd.
  %res = fadd <2 x float> %op1, %op2
  ret <2 x float> %res
}
140
141; Don't use SVE for 128-bit vectors.
define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: @fadd_v4f32
; CHECK: fadd v0.4s, v0.4s, v1.4s
; CHECK: ret
; 128-bit float vector: expected to remain a NEON fadd.
  %res = fadd <4 x float> %op1, %op2
  ret <4 x float> %res
}
149
define void @fadd_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v8f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
; First float size that uses SVE; predicate limited to min(VBYTES/4, 8) words.
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = fadd <8 x float> %op1, %op2
  store <8 x float> %res, <8 x float>* %a
  ret void
}
164
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v16f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = fadd <16 x float> %op1, %op2
  store <16 x float> %res, <16 x float>* %a
  ret void
}
181
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v32f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = fadd <32 x float> %op1, %op2
  store <32 x float> %res, <32 x float>* %a
  ret void
}
198
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: @fadd_v64f32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = fadd <64 x float> %op1, %op2
  store <64 x float> %res, <64 x float>* %a
  ret void
}
215
216; Don't use SVE for 64-bit vectors.
define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: @fadd_v1f64
; CHECK: fadd d0, d0, d1
; CHECK: ret
; Single-element double vector: expected to lower to a scalar fadd.
  %res = fadd <1 x double> %op1, %op2
  ret <1 x double> %res
}
224
225; Don't use SVE for 128-bit vectors.
define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: @fadd_v2f64
; CHECK: fadd v0.2d, v0.2d, v1.2d
; CHECK: ret
; 128-bit double vector: expected to remain a NEON fadd.
  %res = fadd <2 x double> %op1, %op2
  ret <2 x double> %res
}
233
define void @fadd_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v4f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
; First double size that uses SVE; predicate limited to min(VBYTES/8, 4)
; doubleword elements.
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = fadd <4 x double> %op1, %op2
  store <4 x double> %res, <4 x double>* %a
  ret void
}
248
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v8f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = fadd <8 x double> %op1, %op2
  store <8 x double> %res, <8 x double>* %a
  ret void
}
265
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v16f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = fadd <16 x double> %op1, %op2
  store <16 x double> %res, <16 x double>* %a
  ret void
}
282
; NOTE: Check lines only cover the first VBYTES because the fadd_v#f16 tests
; already cover the general legalisation cases.
define void @fadd_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: @fadd_v32f64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: fadd [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
; Only the first register-width chunk is checked for this size.
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = fadd <32 x double> %op1, %op2
  store <32 x double> %res, <32 x double>* %a
  ret void
}
299
; Every function above carries #0, which enables SVE code generation.
attributes #0 = { "target-features"="+sve" }
301