/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */

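/* Check that __builtin_shuffle with interleave-low index patterns on
   fixed-length SVE vectors is implemented as a single ZIP1 at the
   appropriate element size.  */
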
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));

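/* PERMn (B, C) expands to the 2^(n + 1) indices
   B, B + C, B + 1, B + 1 + C, ..., B + 2^n - 1, B + 2^n - 1 + C,
   i.e. a ZIP1-style interleave of the elements starting at B with the
   elements starting at B + C.  */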
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)

/*
** qi_zip1_h_a:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_zip1_h_a (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}

/*
** qi_zip1_h_b:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_zip1_h_b (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}

/*
** qi_zip1_h_c:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_zip1_h_c (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}

/*
** qi_zip1_h_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	zip1	\3\.h, \3\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	zip1	\4\.h, \4\.h, \5\.h
**	st1b	\4\.h, \1, \[x8\]
** )
**	ret
*/
v128qi
qi_zip1_h_two_op (v128qi x, v128qi y)
{
  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}

/*
** qi_zip1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_zip1_s (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}

/*
** qi_zip1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	\3\.s, \3\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	zip1	\4\.s, \4\.s, \5\.s
**	st1b	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64qi
qi_zip1_s_two_op (v64qi x, v64qi y)
{
  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}

/*
** qi_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_zip1_d (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}

/*
** qi_zip1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	\3\.d, \3\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	zip1	\4\.d, \4\.d, \5\.d
**	st1b	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32qi
qi_zip1_d_two_op (v32qi x, v32qi y)
{
  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}

/*
** hi_zip1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_zip1_s (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** hi_zip1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	zip1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hi
hi_zip1_s_two_op (v64hi x, v64hi y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** hf_zip1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_zip1_s (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** hf_zip1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	zip1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hf
hf_zip1_s_two_op (v64hf x, v64hf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** bf_zip1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_zip1_s (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** bf_zip1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	zip1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	zip1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64bf
bf_zip1_s_two_op (v64bf x, v64bf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** hi_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_zip1_d (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** hi_zip1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	zip1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hi
hi_zip1_d_two_op (v32hi x, v32hi y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** hf_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_zip1_d (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** hf_zip1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	zip1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hf
hf_zip1_d_two_op (v32hf x, v32hf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** bf_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_zip1_d (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** bf_zip1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	zip1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32bf
bf_zip1_d_two_op (v32bf x, v32bf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** si_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_zip1_d (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

/*
** sf_zip1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	zip1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_zip1_d (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}