1 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
2 /* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
3 /* { dg-final { check-function-bodies "**" "" } } */
4
5 typedef unsigned char v128qi __attribute__((vector_size(128)));
6 typedef unsigned char v64qi __attribute__((vector_size(64)));
7 typedef unsigned char v32qi __attribute__((vector_size(32)));
8 typedef unsigned short v64hi __attribute__((vector_size(128)));
9 typedef unsigned short v32hi __attribute__((vector_size(64)));
10 typedef _Float16 v64hf __attribute__((vector_size(128)));
11 typedef _Float16 v32hf __attribute__((vector_size(64)));
12 typedef __bf16 v64bf __attribute__((vector_size(128)));
13 typedef __bf16 v32bf __attribute__((vector_size(64)));
14 typedef unsigned int v32si __attribute__((vector_size(128)));
15 typedef float v32sf __attribute__((vector_size(128)));
16
/* PERMn (B) expands to a comma-separated list of 2^(n+1) copies of B,
   i.e. PERM0 gives 2 copies and PERM6 gives 128.  Each level simply
   doubles the previous one.  Used to build all-equal shuffle selectors.  */
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)
24
25 /*
26 ** qi_dup_h_1:
27 ** ptrue (p[0-7])\.b, vl256
28 ** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
29 ** dup (z[0-9]+)\.h, \2\.h\[1\]
30 ** st1b \3\.h, \1, \[x8\]
31 ** ret
32 */
33 v128qi
qi_dup_h_1(v128qi x)34 qi_dup_h_1 (v128qi x)
35 {
36 return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
37 }
38
39 /*
40 ** qi_dup_h_31:
41 ** ptrue (p[0-7])\.b, vl256
42 ** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
43 ** dup (z[0-9]+)\.h, \2\.h\[31\]
44 ** st1b \3\.h, \1, \[x8\]
45 ** ret
46 */
47 v128qi
qi_dup_h_31(v128qi x)48 qi_dup_h_31 (v128qi x)
49 {
50 return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
51 }
52
53 /*
54 ** qi_dup_s_1:
55 ** ptrue (p[0-7])\.b, vl256
56 ** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
57 ** dup (z[0-9]+)\.s, \2\.s\[1\]
58 ** st1b \3\.s, \1, \[x8\]
59 ** ret
60 */
61 v64qi
qi_dup_s_1(v64qi x)62 qi_dup_s_1 (v64qi x)
63 {
64 return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
65 }
66
67 /*
68 ** qi_dup_s_15:
69 ** ptrue (p[0-7])\.b, vl256
70 ** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
71 ** dup (z[0-9]+)\.s, \2\.s\[15\]
72 ** st1b \3\.s, \1, \[x8\]
73 ** ret
74 */
75 v64qi
qi_dup_s_15(v64qi x)76 qi_dup_s_15 (v64qi x)
77 {
78 return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
79 }
80
81 /*
82 ** qi_dup_d_1:
83 ** ptrue (p[0-7])\.b, vl256
84 ** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
85 ** dup (z[0-9]+)\.d, \2\.d\[1\]
86 ** st1b \3\.d, \1, \[x8\]
87 ** ret
88 */
89 v32qi
qi_dup_d_1(v32qi x)90 qi_dup_d_1 (v32qi x)
91 {
92 return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
93 }
94
95 /*
96 ** qi_dup_d_7:
97 ** ptrue (p[0-7])\.b, vl256
98 ** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
99 ** dup (z[0-9]+)\.d, \2\.d\[7\]
100 ** st1b \3\.d, \1, \[x8\]
101 ** ret
102 */
103 v32qi
qi_dup_d_7(v32qi x)104 qi_dup_d_7 (v32qi x)
105 {
106 return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
107 }
108
109 /*
110 ** hi_dup_s_1:
111 ** ptrue (p[0-7])\.b, vl256
112 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
113 ** dup (z[0-9]+)\.s, \2\.s\[1\]
114 ** st1h \3\.s, \1, \[x8\]
115 ** ret
116 */
117 v64hi
hi_dup_s_1(v64hi x)118 hi_dup_s_1 (v64hi x)
119 {
120 return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
121 }
122
123 /*
124 ** hi_dup_s_15:
125 ** ptrue (p[0-7])\.b, vl256
126 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
127 ** dup (z[0-9]+)\.s, \2\.s\[15\]
128 ** st1h \3\.s, \1, \[x8\]
129 ** ret
130 */
131 v64hi
hi_dup_s_15(v64hi x)132 hi_dup_s_15 (v64hi x)
133 {
134 return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
135 }
136
137 /*
138 ** hf_dup_s_1:
139 ** ptrue (p[0-7])\.b, vl256
140 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
141 ** dup (z[0-9]+)\.s, \2\.s\[1\]
142 ** st1h \3\.s, \1, \[x8\]
143 ** ret
144 */
145 v64hf
hf_dup_s_1(v64hf x)146 hf_dup_s_1 (v64hf x)
147 {
148 return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
149 }
150
151 /*
152 ** hf_dup_s_11:
153 ** ptrue (p[0-7])\.b, vl256
154 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
155 ** dup (z[0-9]+)\.s, \2\.s\[11\]
156 ** st1h \3\.s, \1, \[x8\]
157 ** ret
158 */
159 v64hf
hf_dup_s_11(v64hf x)160 hf_dup_s_11 (v64hf x)
161 {
162 return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
163 }
164
165 /*
166 ** bf_dup_s_1:
167 ** ptrue (p[0-7])\.b, vl256
168 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
169 ** dup (z[0-9]+)\.s, \2\.s\[1\]
170 ** st1h \3\.s, \1, \[x8\]
171 ** ret
172 */
173 v64bf
bf_dup_s_1(v64bf x)174 bf_dup_s_1 (v64bf x)
175 {
176 return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
177 }
178
179 /*
180 ** bf_dup_s_13:
181 ** ptrue (p[0-7])\.b, vl256
182 ** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
183 ** dup (z[0-9]+)\.s, \2\.s\[13\]
184 ** st1h \3\.s, \1, \[x8\]
185 ** ret
186 */
187 v64bf
bf_dup_s_13(v64bf x)188 bf_dup_s_13 (v64bf x)
189 {
190 return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
191 }
192
193 /*
194 ** hi_dup_d_1:
195 ** ptrue (p[0-7])\.b, vl256
196 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
197 ** dup (z[0-9]+)\.d, \2\.d\[1\]
198 ** st1h \3\.d, \1, \[x8\]
199 ** ret
200 */
201 v32hi
hi_dup_d_1(v32hi x)202 hi_dup_d_1 (v32hi x)
203 {
204 return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
205 }
206
207 /*
208 ** hi_dup_d_7:
209 ** ptrue (p[0-7])\.b, vl256
210 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
211 ** dup (z[0-9]+)\.d, \2\.d\[7\]
212 ** st1h \3\.d, \1, \[x8\]
213 ** ret
214 */
215 v32hi
hi_dup_d_7(v32hi x)216 hi_dup_d_7 (v32hi x)
217 {
218 return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
219 }
220
221 /*
222 ** hf_dup_d_1:
223 ** ptrue (p[0-7])\.b, vl256
224 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
225 ** dup (z[0-9]+)\.d, \2\.d\[1\]
226 ** st1h \3\.d, \1, \[x8\]
227 ** ret
228 */
229 v32hf
hf_dup_d_1(v32hf x)230 hf_dup_d_1 (v32hf x)
231 {
232 return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
233 }
234
235 /*
236 ** hf_dup_d_5:
237 ** ptrue (p[0-7])\.b, vl256
238 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
239 ** dup (z[0-9]+)\.d, \2\.d\[5\]
240 ** st1h \3\.d, \1, \[x8\]
241 ** ret
242 */
243 v32hf
hf_dup_d_5(v32hf x)244 hf_dup_d_5 (v32hf x)
245 {
246 return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
247 }
248
249 /*
250 ** bf_dup_d_1:
251 ** ptrue (p[0-7])\.b, vl256
252 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
253 ** dup (z[0-9]+)\.d, \2\.d\[1\]
254 ** st1h \3\.d, \1, \[x8\]
255 ** ret
256 */
257 v32bf
bf_dup_d_1(v32bf x)258 bf_dup_d_1 (v32bf x)
259 {
260 return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
261 }
262
263 /*
264 ** bf_dup_d_6:
265 ** ptrue (p[0-7])\.b, vl256
266 ** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
267 ** dup (z[0-9]+)\.d, \2\.d\[6\]
268 ** st1h \3\.d, \1, \[x8\]
269 ** ret
270 */
271 v32bf
bf_dup_d_6(v32bf x)272 bf_dup_d_6 (v32bf x)
273 {
274 return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
275 }
276
277 /*
278 ** si_dup_d_1:
279 ** ptrue (p[0-7])\.b, vl256
280 ** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
281 ** dup (z[0-9]+)\.d, \2\.d\[1\]
282 ** st1w \3\.d, \1, \[x8\]
283 ** ret
284 */
285 v32si
si_dup_d_1(v32si x)286 si_dup_d_1 (v32si x)
287 {
288 return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
289 }
290
291 /*
292 ** si_dup_d_7:
293 ** ptrue (p[0-7])\.b, vl256
294 ** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
295 ** dup (z[0-9]+)\.d, \2\.d\[7\]
296 ** st1w \3\.d, \1, \[x8\]
297 ** ret
298 */
299 v32si
si_dup_d_7(v32si x)300 si_dup_d_7 (v32si x)
301 {
302 return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
303 }
304
305 /*
306 ** sf_dup_d_1:
307 ** ptrue (p[0-7])\.b, vl256
308 ** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
309 ** dup (z[0-9]+)\.d, \2\.d\[1\]
310 ** st1w \3\.d, \1, \[x8\]
311 ** ret
312 */
313 v32sf
sf_dup_d_1(v32sf x)314 sf_dup_d_1 (v32sf x)
315 {
316 return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
317 }
318
319 /*
320 ** sf_dup_d_7:
321 ** ptrue (p[0-7])\.b, vl256
322 ** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
323 ** dup (z[0-9]+)\.d, \2\.d\[7\]
324 ** st1w \3\.d, \1, \[x8\]
325 ** ret
326 */
327 v32sf
sf_dup_d_7(v32sf x)328 sf_dup_d_7 (v32sf x)
329 {
330 return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
331 }
332