/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */

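/* The patterns below check that broadcasting a single element of a vector
   held in unpacked SVE containers is emitted as a predicated LD1, an
   indexed DUP and a predicated ST1.  */
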
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));

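/* PERMn (B) expands to 2^(n + 1) copies of B, so e.g. PERM6 (1) builds the
   128-element selector { 1, 1, ..., 1 } that broadcasts element 1.  */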
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)

/*
** qi_dup_h_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	dup	(z[0-9]+)\.h, \2\.h\[1\]
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_dup_h_1 (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}

/*
** qi_dup_h_31:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	dup	(z[0-9]+)\.h, \2\.h\[31\]
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_dup_h_31 (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
}

/*
** qi_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_dup_s_1 (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}

/*
** qi_dup_s_15:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[15\]
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_dup_s_15 (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
}

/*
** qi_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_dup_d_1 (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}

/*
** qi_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_dup_d_7 (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
}

/*
** hi_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_dup_s_1 (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** hi_dup_s_15:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[15\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_dup_s_15 (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
}

/*
** hf_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_dup_s_1 (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** hf_dup_s_11:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[11\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_dup_s_11 (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
}

/*
** bf_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_dup_s_1 (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** bf_dup_s_13:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[13\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_dup_s_13 (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
}

/*
** hi_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_dup_d_1 (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** hi_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_dup_d_7 (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}

/*
** hf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_dup_d_1 (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** hf_dup_d_5:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[5\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_dup_d_5 (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
}

/*
** bf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_dup_d_1 (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** bf_dup_d_6:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[6\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_dup_d_6 (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
}

/*
** si_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_dup_d_1 (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

/*
** si_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_dup_d_7 (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}

/*
** sf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_dup_d_1 (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

/*
** sf_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_dup_d_7 (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}