1/*
2	dct64_neon64: NEON optimized dct64 for AArch64
3
4	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
11#ifndef __APPLE__
12	.section	.rodata
13#else
14	.data
15#endif
16	ALIGN16
/*
	costab_neon_aarch64: 32 words of constant data used by dct64_neon64.

	The routine loads these words into .4s vector registers and uses them only
	as fmul operands, so every word is consumed as the bit pattern of a
	single-precision float.  They are read strictly in order through x4:

	 words  0-15	one 64-byte load, multipliers of the first pass
	 words 16-23	one 32-byte load, multipliers of the second pass
	 words 24-27	one 16-byte load, multipliers of the third pass
	 words 28-31	one 16-byte load, shared by the fourth and fifth pass

	The table is placed in .rodata, except on Apple targets where .data is used.
	NOTE(review): ALIGN16 comes from mangle.h; presumably it requests 16-byte
	alignment -- confirm there.
*/
17costab_neon_aarch64:
18	.word 1056974725	/* words 0-15: first pass */
19	.word 1057056395
20	.word 1057223771
21	.word 1057485416
22	.word 1057855544
23	.word 1058356026
24	.word 1059019886
25	.word 1059897405
26	.word 1061067246
27	.word 1062657950
28	.word 1064892987
29	.word 1066774581
30	.word 1069414683
31	.word 1073984175
32	.word 1079645762
33	.word 1092815430
34	.word 1057005197	/* words 16-23: second pass */
35	.word 1057342072
36	.word 1058087743
37	.word 1059427869
38	.word 1061799040
39	.word 1065862217
40	.word 1071413542
41	.word 1084439708
42	.word 1057128951	/* words 24-27: third pass */
43	.word 1058664893
44	.word 1063675095
45	.word 1076102863
46	.word 1057655764	/* words 28-29: fourth pass (64-bit lane 0 of the last load) */
47	.word 1067924853
48	.word 1060439283	/* word 30: fifth pass (32-bit lane 2 of the last load) */
49	.word 1060439283	/* word 31: same value again; completes the final 16-byte load */
50	.text
51	ALIGN4
52	.globl ASM_NAME(dct64_neon64)
53#ifdef __ELF__
54	.type ASM_NAME(dct64_neon64), %function
55#endif
/*
	dct64_neon64(x0, x1, x2)

	In:
	 x2 = pointer to 32 single-precision input values (128 bytes are read)
	 x0 = first output pointer:  17 signed 16-bit values are stored, one every 32 bytes
	 x1 = second output pointer: 16 signed 16-bit values are stored, one every 32 bytes
	NOTE(review): presumably called from C as
	 void dct64_neon64(short *out0, short *out1, real *samples) -- confirm against the caller.

	Structure: five passes that each form sums and coefficient-scaled differences
	of mirrored elements, then a chain of plain additions, a float -> int16
	conversion and the scattered stores.
	The bs[...] comments list, lane by lane, the index of the intermediate value a
	register holds after the instruction; the index range alternates between
	0-31 and 32-63 from one pass to the next.

	Clobbers: x0 and x1 (post-incremented by the stores), x3, x4,
	          v0-v7, v16-v29, v31.
	Neither the stack nor v8-v15 are touched, and no other function is called.
*/
56ASM_NAME(dct64_neon64):
57	add		x3, x2, #64	/* x3 = second half of the input (16 floats in) */
58	adrp	x4, AARCH64_PCREL_HI(costab_neon_aarch64)
59	add		x4, x4, AARCH64_PCREL_LO(costab_neon_aarch64)	/* x4 = coefficient table cursor */
60	ld1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x2]	/* v0-v3 = in[0..15] */
61	ld1		{v16.4s, v17.4s, v18.4s, v19.4s}, [x3]	/* v16-v19 = in[16..31] */
62	ld1		{v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64	/* table words 0-15 */
63
	/*
	  rev64 swaps the two 32-bit lanes inside each 64-bit half and ext #8 swaps
	  the halves, so each rev64/ext pair reverses all four lanes of a register.
	  Result: v4-v7 = in[31..16], mirrored against v0-v3.
	*/
64	rev64	v19.4s, v19.4s
65	rev64	v18.4s, v18.4s
66	rev64	v17.4s, v17.4s
67	rev64	v16.4s, v16.4s
68	ext		v4.16b, v19.16b, v19.16b, #8
69	ext		v5.16b, v18.16b, v18.16b, #8
70	ext		v6.16b, v17.16b, v17.16b, #8
71	ext		v7.16b, v16.16b, v16.16b, #8
72
	/*
	  Pass 1: sums in[i] + in[31-i] go to v0-v3, differences in[i] - in[31-i]
	  scaled by table words 0-15 go to v16-v19.  The differences are computed
	  first because the sums overwrite v0-v3.
	*/
73	fsub	v16.4s, v3.4s, v7.4s
74	fsub	v17.4s, v2.4s, v6.4s
75	fsub	v18.4s, v1.4s, v5.4s
76	fsub	v19.4s, v0.4s, v4.4s
77	fadd	v0.4s, v0.4s, v4.4s		/* bs[0,1,2,3] */
78	fadd	v1.4s, v1.4s, v5.4s		/* bs[4,5,6,7] */
79	fadd	v2.4s, v2.4s, v6.4s		/* bs[8,9,10,11] */
80	fadd	v3.4s, v3.4s, v7.4s		/* bs[12,13,14,15] */
81	fmul	v16.4s, v16.4s, v23.4s	/* bs[19,18,17,16] */
82	fmul	v17.4s, v17.4s, v22.4s	/* bs[23,22,21,20] */
83	fmul	v18.4s, v18.4s, v21.4s	/* bs[27,26,25,24] */
84	fmul	v19.4s, v19.4s, v20.4s	/* bs[31,30,29,28] */
85
	/*
	  Pass 2: the same butterfly on each 16-value half, using table words 16-23
	  (v20, v21) for both halves.  Note the subtraction operand order: the lower
	  half (v26, v27) subtracts the mirrored element from the low-index one, the
	  upper half (v28, v29) subtracts the low-index element from the mirrored one.
	*/
86	ld1		{v20.4s, v21.4s}, [x4], #32
87	rev64	v22.4s, v3.4s
88	rev64	v23.4s, v2.4s
89	rev64	v24.4s, v16.4s
90	rev64	v25.4s, v17.4s
91	ext		v4.16b, v22.16b, v22.16b, #8	/* bs[15,14,13,12] */
92	ext		v5.16b, v23.16b, v23.16b, #8	/* bs[11,10,9,8] */
93	ext		v6.16b, v24.16b, v24.16b, #8	/* bs[16,17,18,19] */
94	ext		v7.16b, v25.16b, v25.16b, #8	/* bs[20,21,22,23] */
95
96	fsub	v26.4s, v1.4s, v5.4s
97	fsub	v27.4s, v0.4s, v4.4s
98	fsub	v28.4s, v18.4s, v7.4s
99	fsub	v29.4s, v19.4s, v6.4s
100	fadd	v4.4s, v0.4s, v4.4s		/* bs[32,33,34,35] */
101	fadd	v5.4s, v1.4s, v5.4s		/* bs[36,37,38,39] */
102	fadd	v6.4s, v6.4s, v19.4s	/* bs[48,49,50,51] */
103	fadd	v7.4s, v7.4s, v18.4s	/* bs[52,53,54,55] */
104	fmul	v26.4s, v26.4s, v21.4s	/* bs[43,42,41,40] */
105	fmul	v27.4s, v27.4s, v20.4s	/* bs[47,46,45,44] */
106	fmul	v28.4s, v28.4s, v21.4s	/* bs[59,58,57,56] */
107	fmul	v29.4s, v29.4s, v20.4s	/* bs[63,62,61,60] */
108
	/*
	  Pass 3: butterfly on each of the four 8-value groups, all scaled by the same
	  four coefficients (table words 24-27 in v20).  The subtraction direction
	  again alternates from group to group (v16/v18 versus v17/v19).
	*/
109	ld1		{v20.4s}, [x4], #16
110	rev64	v16.4s, v5.4s
111	rev64	v17.4s, v26.4s
112	rev64	v18.4s, v7.4s
113	rev64	v19.4s, v28.4s
114	ext		v0.16b, v16.16b, v16.16b, #8	/* bs[39,38,37,36] */
115	ext		v1.16b, v17.16b, v17.16b, #8	/* bs[40,41,42,43] */
116	ext		v2.16b, v18.16b, v18.16b, #8	/* bs[55,54,53,52] */
117	ext		v3.16b, v19.16b, v19.16b, #8	/* bs[56,57,58,59] */
118
119	fsub	v16.4s, v4.4s, v0.4s
120	fsub	v17.4s, v27.4s, v1.4s
121	fsub	v18.4s, v6.4s, v2.4s
122	fsub	v19.4s, v29.4s, v3.4s
123	fadd	v0.4s, v4.4s, v0.4s		/* bs[0,1,2,3] */
124	fadd	v1.4s, v1.4s, v27.4s	/* bs[8,9,10,11] */
125	fadd	v2.4s, v6.4s, v2.4s		/* bs[16,17,18,19] */
126	fadd	v3.4s, v3.4s, v29.4s	/* bs[24,25,26,27] */
127	fmul	v16.4s, v16.4s, v20.4s	/* bs[7,6,5,4] */
128	fmul	v17.4s, v17.4s, v20.4s	/* bs[15,14,13,12] */
129	fmul	v18.4s, v18.4s, v20.4s	/* bs[23,22,21,20] */
130	fmul	v19.4s, v19.4s, v20.4s	/* bs[31,30,29,28] */
131
	/*
	  Pass 4: butterfly on groups of four values.  The 64-bit zips pair the sum
	  half of each 8-value group with its difference half, and rev64 mirrors one
	  operand so that a single fsub/fadd handles two groups of four at once.
	  v28 receives table words 28-31 (last load, no post-increment).
	  NOTE(review): AARCH64_DUP_2D / AARCH64_DUP_4S are not defined in this file
	  (presumably in mangle.h).  Going by their names and the lane comments below,
	  v29 becomes words 28,29 repeated twice and v28 becomes word 30 in every
	  lane -- confirm against the macro definitions.
	*/
132	ld1		{v28.4s}, [x4]
133	zip1	v4.2d, v0.2d, v16.2d	/* bs[0,1,7,6] */
134	zip2	v5.2d, v0.2d, v16.2d	/* bs[2,3,5,4] */
135	zip1	v6.2d, v1.2d, v17.2d	/* bs[8,9,15,14] */
136	zip2	v7.2d, v1.2d, v17.2d	/* bs[10,11,13,12] */
137	zip1	v20.2d, v2.2d, v18.2d	/* bs[16,17,23,22] */
138	zip2	v21.2d, v2.2d, v18.2d	/* bs[18,19,21,20] */
139	zip1	v22.2d, v3.2d, v19.2d	/* bs[24,25,31,30] */
140	zip2	v23.2d, v3.2d, v19.2d	/* bs[26,27,29,28] */
141	rev64	v5.4s, v5.4s			/* bs[3,2,4,5] */
142	rev64	v7.4s, v7.4s			/* bs[11,10,12,13] */
143	rev64	v21.4s, v21.4s			/* bs[19,18,20,21] */
144	rev64	v23.4s, v23.4s			/* bs[27,26,28,29] */
145	AARCH64_DUP_2D(v29, v28, 0)
146	AARCH64_DUP_4S(v28, v28, 2)
147
148	fsub	v16.4s, v4.4s, v5.4s
149	fsub	v17.4s, v6.4s, v7.4s
150	fsub	v18.4s, v20.4s, v21.4s
151	fsub	v19.4s, v22.4s, v23.4s
152	fadd	v0.4s, v4.4s, v5.4s		/* bs[32,33,36,37] */
153	fadd	v1.4s, v6.4s, v7.4s		/* bs[40,41,44,45] */
154	fadd	v2.4s, v20.4s, v21.4s	/* bs[48,49,52,53] */
155	fadd	v3.4s, v22.4s, v23.4s	/* bs[56,57,60,61] */
156	fmul	v16.4s, v16.4s, v29.4s	/* bs[35,34,39,38] */
157	fmul	v17.4s, v17.4s, v29.4s	/* bs[43,42,47,46] */
158	fmul	v18.4s, v18.4s, v29.4s	/* bs[51,50,55,54] */
159	fmul	v19.4s, v19.4s, v29.4s	/* bs[59,58,63,62] */
160
	/*
	  Pass 5: butterfly on adjacent pairs.  uzp1/uzp2 split even and odd lanes so
	  the two members of every pair end up in the same lane of two registers;
	  all differences are scaled by the single coefficient held in v28.
	*/
161	uzp1	v4.4s, v0.4s, v16.4s	/* bs[32,36,35,39] */
162	uzp2	v5.4s, v0.4s, v16.4s	/* bs[33,37,34,38] */
163	uzp1	v6.4s, v1.4s, v17.4s	/* bs[40,44,43,47] */
164	uzp2	v7.4s, v1.4s, v17.4s	/* bs[41,45,42,46] */
165	uzp1	v20.4s, v2.4s, v18.4s	/* bs[48,52,51,55] */
166	uzp2	v21.4s, v2.4s, v18.4s	/* bs[49,53,50,54] */
167	uzp1	v22.4s, v3.4s, v19.4s	/* bs[56,60,59,63] */
168	uzp2	v23.4s, v3.4s, v19.4s	/* bs[57,61,58,62] */
169
170	fsub	v16.4s, v4.4s, v5.4s
171	fsub	v17.4s, v6.4s, v7.4s
172	fsub	v18.4s, v20.4s, v21.4s
173	fsub	v19.4s, v22.4s, v23.4s
174	fadd	v0.4s, v4.4s, v5.4s		/* bs[0,4,2,6] */
175	fadd	v1.4s, v6.4s, v7.4s		/* bs[8,12,10,14] */
176	fadd	v2.4s, v20.4s, v21.4s	/* bs[16,20,18,22] */
177	fadd	v3.4s, v22.4s, v23.4s	/* bs[24,28,26,30] */
178	fmul	v16.4s, v16.4s, v28.4s	/* bs[1,5,3,7] */
179	fmul	v17.4s, v17.4s, v28.4s	/* bs[9,13,11,15] */
180	fmul	v18.4s, v18.4s, v28.4s	/* bs[17,21,19,23] */
181	fmul	v19.4s, v19.4s, v28.4s	/* bs[25,29,27,31] */
182
	/*
	  Accumulation step 1: bs[4k+2] += bs[4k+3] for all eight groups of four.
	  The affected lanes are gathered with zip2, added, and the sums are written
	  back into the upper 64 bits of v0-v3.
	*/
183	zip2	v4.2d, v0.2d, v1.2d		/* bs[2,6,10,14] */
184	zip2	v5.2d, v16.2d, v17.2d	/* bs[3,7,11,15] */
185	zip2	v6.2d, v2.2d, v3.2d		/* bs[18,22,26,30] */
186	zip2	v7.2d, v18.2d, v19.2d	/* bs[19,23,27,31] */
187	fadd	v4.4s, v4.4s, v5.4s		/* bs[2,6,10,14] */
188	fadd	v6.4s, v6.4s, v7.4s		/* bs[18,22,26,30] */
189	ins		v0.d[1], v4.d[0]		/* bs[0,4,2,6] */
190	ins		v1.d[1], v4.d[1]		/* bs[8,12,10,14] */
191	ins		v2.d[1], v6.d[0]		/* bs[16,20,18,22] */
192	ins		v3.d[1], v6.d[1]		/* bs[24,28,26,30] */
193
	/*
	  Bring everything back into natural order (v0-v3 = bs[0..15],
	  v4-v7 = bs[16..31]) and, from the same intermediates, build the rotated
	  addend vectors v16-v19 and v24-v26 for the remaining additions.
	  v31 is zeroed so that lane 3 of every addend can be cleared: the last
	  element of each group must pass through its add unchanged.
	  Ordering matters here: v4-v7 are overwritten at the second set of 64-bit
	  zips only after the first set has consumed their old contents.
	*/
194	eor		v31.16b, v31.16b, v31.16b
195	zip1	v4.4s, v0.4s, v16.4s	/* bs[0,1,4,5] */
196	zip2	v5.4s, v0.4s, v16.4s	/* bs[2,3,6,7] */
197	zip1	v6.4s, v1.4s, v17.4s	/* bs[8,9,12,13] */
198	zip2	v7.4s, v1.4s, v17.4s	/* bs[10,11,14,15] */
199	zip1	v20.4s, v2.4s, v18.4s	/* bs[16,17,20,21] */
200	zip2	v21.4s, v2.4s, v18.4s	/* bs[18,19,22,23] */
201	zip1	v22.4s, v3.4s, v19.4s	/* bs[24,25,28,29] */
202	zip2	v23.4s, v3.4s, v19.4s	/* bs[26,27,30,31] */
203	zip1	v0.2d, v4.2d, v5.2d		/* bs[0,1,2,3] */
204	zip2	v1.2d, v4.2d, v5.2d		/* bs[4,5,6,7] */
205	zip1	v2.2d, v6.2d, v7.2d		/* bs[8,9,10,11] */
206	zip2	v3.2d, v6.2d, v7.2d		/* bs[12,13,14,15] */
207	rev64	v16.4s, v4.4s
208	rev64	v17.4s,	v6.4s
209	zip1	v24.2d, v7.2d, v17.2d
210	zip2	v16.2d, v5.2d, v16.2d
211	zip2	v17.2d, v7.2d, v17.2d
212	zip1	v4.2d, v20.2d, v21.2d	/* bs[16,17,18,19] */
213	zip2	v5.2d, v20.2d, v21.2d	/* bs[20,21,22,23] */
214	zip1	v6.2d, v22.2d, v23.2d	/* bs[24,25,26,27] */
215	zip2	v7.2d, v22.2d, v23.2d	/* bs[28,29,30,31] */
216	rev64	v18.4s, v20.4s
217	rev64	v19.4s, v22.4s
218	zip1	v25.2d, v23.2d, v19.2d
219	zip1	v26.2d, v21.2d, v18.2d
220	zip2	v18.2d, v21.2d, v18.2d
221	zip2	v19.2d, v23.2d, v19.2d
222	ins		v16.s[3], v31.s[0]		/* bs[6,7,5,-] */
223	ins		v17.s[3], v31.s[0]		/* bs[14,15,13,-] */
224	ins		v18.s[3], v31.s[0]		/* bs[22,23,21,-] */
225	ins		v19.s[3], v31.s[0]		/* bs[30,31,29,-] */
226	ins		v24.s[3], v31.s[0]		/* bs[10,11,9,-] */
227	ins		v25.s[3], v31.s[0]		/* bs[26,27,25,-] */
228	ins		v26.s[3], v31.s[0]		/* bs[18,19,17,-] */
229
	/* Accumulation step 2: in every group of eight, bs[4,5,6,7] += bs[6,7,5,-]. */
230	fadd	v1.4s, v1.4s, v16.4s
231	fadd	v3.4s, v3.4s, v17.4s
232	fadd	v5.4s, v5.4s, v18.4s
233	fadd	v7.4s, v7.4s, v19.4s
234
	/*
	  Accumulation step 3: in every group of sixteen, the third quarter takes the
	  freshly updated fourth quarter, then the fourth quarter takes the rotated
	  original third quarter (v24 / v25).
	*/
235	fadd	v2.4s, v2.4s, v3.4s
236	fadd	v3.4s, v3.4s, v24.4s
237	fadd	v6.4s, v6.4s, v7.4s
238	fadd	v7.4s, v7.4s, v25.4s
239
	/*
	  Accumulation step 4, upper sixteen values only: each register takes the
	  value its successor in the chain v4 <- v6 <- v5 <- v7 held before the
	  successor itself is updated; v7 finally takes the rotated bs[18,19,17,-].
	*/
240	fadd	v4.4s, v4.4s, v6.4s
241	fadd	v6.4s, v6.4s, v5.4s
242	fadd	v5.4s, v5.4s, v7.4s
243	fadd	v7.4s, v7.4s, v26.4s
244
	/*
	  Convert to 16-bit output: fcvtns rounds each float to the nearest signed
	  32-bit integer (ties to even), sqxtn narrows to signed 16 bit with
	  saturation.  Afterwards the results sit in halfword lanes 0-3 of v0-v7.
	*/
245	fcvtns	v0.4s, v0.4s
246	fcvtns	v1.4s, v1.4s
247	fcvtns	v2.4s, v2.4s
248	fcvtns	v3.4s, v3.4s
249	fcvtns	v4.4s, v4.4s
250	fcvtns	v5.4s, v5.4s
251	fcvtns	v6.4s, v6.4s
252	fcvtns	v7.4s, v7.4s
253	sqxtn	v0.4h, v0.4s
254	sqxtn	v1.4h, v1.4s
255	sqxtn	v2.4h, v2.4s
256	sqxtn	v3.4h, v3.4s
257	sqxtn	v4.4h, v4.4s
258	sqxtn	v5.4h, v5.4s
259	sqxtn	v6.4h, v6.4s
260	sqxtn	v7.4h, v7.4s
261
	/*
	  Scatter the results.  x3 = 32 is the byte distance between consecutive
	  stores (16 halfwords).  17 values go to the x0 buffer and 16 to the x1
	  buffer; the final store of each run leaves the pointer where it is.
	  Both runs start with the same value, v0.h[1].
	*/
262	mov		x3, #32
263	st1		{v0.h}[1], [x0], x3
264	st1		{v7.h}[2], [x0], x3
265	st1		{v3.h}[2], [x0], x3
266	st1		{v5.h}[2], [x0], x3
267	st1		{v1.h}[2], [x0], x3
268	st1		{v6.h}[2], [x0], x3
269	st1		{v2.h}[2], [x0], x3
270	st1		{v4.h}[2], [x0], x3
271	st1		{v0.h}[2], [x0], x3
272	st1		{v7.h}[0], [x0], x3
273	st1		{v3.h}[0], [x0], x3
274	st1		{v5.h}[0], [x0], x3
275	st1		{v1.h}[0], [x0], x3
276	st1		{v6.h}[0], [x0], x3
277	st1		{v2.h}[0], [x0], x3
278	st1		{v4.h}[0], [x0], x3
279	st1		{v0.h}[0], [x0]
280	st1		{v0.h}[1], [x1], x3
281	st1		{v4.h}[1], [x1], x3
282	st1		{v2.h}[1], [x1], x3
283	st1		{v6.h}[1], [x1], x3
284	st1		{v1.h}[1], [x1], x3
285	st1		{v5.h}[1], [x1], x3
286	st1		{v3.h}[1], [x1], x3
287	st1		{v7.h}[1], [x1], x3
288	st1		{v0.h}[3], [x1], x3
289	st1		{v4.h}[3], [x1], x3
290	st1		{v2.h}[3], [x1], x3
291	st1		{v6.h}[3], [x1], x3
292	st1		{v1.h}[3], [x1], x3
293	st1		{v5.h}[3], [x1], x3
294	st1		{v3.h}[3], [x1], x3
295	st1		{v7.h}[3], [x1]
296
297	ret
298
/* NOTE(review): NONEXEC_STACK is a macro that is not defined in this file (presumably in mangle.h); judging by its name it marks the object as not needing an executable stack -- confirm there. */
299NONEXEC_STACK
300