/*
	dct36_neon64: NEON optimized dct36 for AArch64

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	dct36_aarch64_COS9: read-only constant table used by dct36_neon64.

	26 single-precision values stored as raw IEEE-754 bit patterns.
	The routine consumes the table strictly in order with three loads:
	  words  0..15 -> four 4-lane vectors (each constant duplicated pairwise)
	  words 16..23 -> two 4-lane vectors of per-lane scale factors
	  words 24..25 -> one 2-lane vector of scale factors
	so the order of the entries must not be changed.
	Decimal values in the comments are rounded.
	ALIGN16 comes from mangle.h (presumably 16-byte alignment - confirm there).
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
dct36_aarch64_COS9:
	/* words 0..3: COS9_[3,3,6,6] */
	.word 0x3f5db3d7	/* 0.8660254 */
	.word 0x3f5db3d7
	.word 0x3f000000	/* 0.5 */
	.word 0x3f000000
	/* words 4..7: COS9_[1,1,2,2] */
	.word 0x3f7c1c5c	/* 0.9848078 */
	.word 0x3f7c1c5c
	.word 0x3f708fb2	/* 0.9396926 */
	.word 0x3f708fb2
	/* words 8..11: COS9_[5,5,8,8] */
	.word 0x3f248dbb	/* 0.6427876 */
	.word 0x3f248dbb
	.word 0x3e31d0d4	/* 0.1736482 */
	.word 0x3e31d0d4
	/* words 12..15: COS9_[7,7,4,4] */
	.word 0x3eaf1d44	/* 0.3420201 */
	.word 0x3eaf1d44
	.word 0x3f441b7d	/* 0.7660444 */
	.word 0x3f441b7d
	/* words 16..19: scale factors for tmp[17,16,15,14] */
	.word 0x3f007d2b	/* 0.5019099 */
	.word 0x3f0483ee	/* 0.5176381 */
	.word 0x3f0d3b7d	/* 0.5516890 */
	.word 0x3f1c4257	/* 0.6103873 */
	/* words 20..23: scale factors for tmp[9,10,11,12] */
	.word 0x40b79454	/* 5.7368566 */
	.word 0x3ff746ea	/* 1.9318517 */
	.word 0x3f976fd9	/* 1.1831008 */
	.word 0x3f5f2944	/* 0.8717234 */
	/* words 24..25: scale factors for tmp[4,13] */
	.word 0x3f800000	/* 1.0 */
	.word 0x3f3504f3	/* 0.7071068 */

/*
	dct36_neon64: 36-point DCT stage with windowing, AArch64 NEON.

	The C prototype is declared elsewhere in the project; the roles below are
	derived from how this routine uses each argument register (all element
	indices count 32-bit floats):
	  x0  in : 18 floats, read only (x0 is reused as a store cursor afterwards)
	  x1  in : 18 floats, read only; added into the values written through x4
	  x2  out: 18 floats, written
	  x3  in : 36 floats, read only; used as multipliers
	           (presumably the window table - confirm against the caller)
	  x4  out: 18 floats written with a stride of 128 bytes (slots 0..17)

	Clobbers: x0, x4, x5, x6, x7, x9, v0-v7, v16-v29.
	No stack is used and the callee-saved v8-v15 are not touched.
	Leaf routine, returns nothing.
*/
	.text
	ALIGN4
	.globl ASM_NAME(dct36_neon64)
#ifdef __ELF__
	.type ASM_NAME(dct36_neon64), %function
#endif
ASM_NAME(dct36_neon64):
	/* x5 = address of the constant table */
	adrp		x5, AARCH64_PCREL_HI(dct36_aarch64_COS9)
	add			x5, x5, AARCH64_PCREL_LO(dct36_aarch64_COS9)
	/* v28 = mask keeping 32-bit lanes 1 and 3 (all ones, then each 64-bit
	   lane shifted left by 32); v29 = 0 */
	cmeq		v28.16b, v28.16b, v28.16b
	eor			v29.16b, v29.16b, v29.16b
	shl			v28.2d, v28.2d, #32
	/* load in[0..15] into v0-v3 and in[16,17] into the low half of v4 */
	ld1			{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], #64
	ld1			{v4.2s}, [x0]

	/* first pass: in[i] += in[i-1] for every i, with 0 standing in for in[-1].
	   Each ext builds the input vector shifted up by one element. */
	ext			v16.16b, v29.16b, v0.16b, #12
	ext			v17.16b, v0.16b, v1.16b, #12
	ext			v18.16b, v1.16b, v2.16b, #12
	ext			v19.16b, v2.16b, v3.16b, #12
	ext			v20.16b, v3.16b, v4.16b, #12
	fadd		v0.4s, v0.4s, v16.4s
	fadd		v1.4s, v1.4s, v17.4s
	fadd		v2.4s, v2.4s, v18.4s
	fadd		v3.4s, v3.4s, v19.4s
	fadd		v4.2s, v4.2s, v20.2s

	/* second pass: odd-indexed in[i] += in[i-2]; the mask in v28 zeroes the
	   even lanes of the addend. The ext by 8 bytes also moves every value
	   two lanes along, giving the register layout listed below. */
	ext			v16.16b, v0.16b, v1.16b, #8
	ext			v17.16b, v1.16b, v2.16b, #8
	ext			v18.16b, v2.16b, v3.16b, #8
	ext			v19.16b, v3.16b, v4.16b, #8
	and			v20.16b, v0.16b, v28.16b
	ext			v0.16b, v29.16b, v0.16b, #8
	and			v21.16b, v1.16b, v28.16b
	and			v22.16b, v2.16b, v28.16b
	and			v23.16b, v3.16b, v28.16b
	fadd		v1.4s, v20.4s, v16.4s
	fadd		v2.4s, v21.4s, v17.4s
	fadd		v3.4s, v22.4s, v18.4s
	fadd		v4.4s, v23.4s, v19.4s

/*
v0 in[-,-,0,1]
v1 in[2,3,4,5]
v2 in[6,7,8,9]
v3 in[10,11,12,13]
v4 in[14,15,16,17]
*/

	/* rotate the upper halves of v2/v3/v4 (v5 keeps the old v2) */
	orr			v5.16b, v2.16b, v2.16b
	ins			v2.d[1], v3.d[1]
	ins			v3.d[1], v4.d[1]
	ins			v4.d[1], v5.d[1]

/*
v2 in[6,7,12,13]
v3 in[10,11,16,17]
v4 in[14,15,8,9]
*/

	/* v16-v19 = first 16 table words; v20 = v0 + v2 * v16 */
	ld1			{v16.4s,v17.4s,v18.4s,v19.4s}, [x5], #64
	orr			v20.16b, v0.16b, v0.16b
	fmla		v20.4s, v2.4s, v16.4s

/*
v17 COS9_[1,1,2,2]
v18 COS9_[5,5,8,8]
v19 COS9_[7,7,4,4]
v16 COS9_[3,3,6,6]
v20 [ta33,tb33,ta66,tb66]
*/

	/* build the four intermediate vectors v24..v27; even-indexed inputs sit
	   in lanes 0/2 and odd-indexed inputs in lanes 1/3 throughout */
	orr			v21.16b, v20.16b, v20.16b
	orr			v23.16b, v20.16b, v20.16b
	zip2		v25.2d, v29.2d, v2.2d		/* v25 = [0,0,in12,in13] */
	fsub		v22.4s, v1.4s, v3.4s
	fmul		v24.4s, v1.4s, v17.4s
	fmul		v26.4s, v1.4s, v18.4s
	fmul		v27.4s, v1.4s, v19.4s
	fmla		v21.4s, v3.4s, v18.4s
	fmla		v23.4s, v3.4s, v19.4s
	fmla		v20.4s, v4.4s, v18.4s
	fsub		v25.4s, v0.4s, v25.4s
	fsub		v22.4s, v22.4s, v4.4s
	fmla		v24.4s, v4.4s, v19.4s
	fmla		v26.4s, v4.4s, v17.4s
	fmla		v27.4s, v3.4s, v17.4s
	fmla		v25.4s, v22.4s, v16.4s
	fadd		v24.4s, v24.4s, v21.4s
	fsub		v26.4s, v26.4s, v23.4s
	fsub		v27.4s, v27.4s, v20.4s

	/* 4x4 transpose of v24..v27 into v20..v23; the rows that pass through
	   v19 are negated on the way */
	zip1		v16.4s, v24.4s, v25.4s
	zip2		v17.4s, v24.4s, v25.4s
	zip1		v18.4s, v26.4s, v27.4s
	zip2		v19.4s, v26.4s, v27.4s
	fneg		v19.4s, v19.4s
	zip1		v20.2d, v16.2d, v18.2d
	zip1		v21.2d, v17.2d, v19.2d
	zip2		v22.2d, v16.2d, v18.2d
	zip2		v23.2d, v17.2d, v19.2d

	/* v5, v6 and v7.2s = remaining table words (per-lane scale factors).
	   Sums and differences of the transposed rows give the tmp[] vectors
	   listed below; v0 accumulates v0 - v1 + v3 + (v4 - v2). */
	ld1			{v5.4s,v6.4s}, [x5], #32
	ld1			{v7.2s}, [x5]
	fsub		v0.4s, v0.4s, v1.4s
	fsub		v4.4s, v4.4s, v2.4s
	fadd		v17.4s, v22.4s, v23.4s
	fsub		v19.4s, v23.4s, v22.4s
	fadd		v0.4s, v0.4s, v3.4s
	fadd		v16.4s, v20.4s, v21.4s
	fsub		v18.4s, v21.4s, v20.4s
	fadd		v0.4s, v0.4s, v4.4s
	fmul		v17.4s, v17.4s, v5.4s
	fmul		v19.4s, v19.4s, v6.4s
	/* macro from mangle.h; presumably copies the upper 64 bits of v0 into
	   both halves - confirm there */
	AARCH64_DUP_2D(v0, v0, 1)
	fmul		v0.2s, v0.2s, v7.2s

/*
v16 tmp[0,1,2,3]
v17 tmp[17,16,15,14]
v18 tmp[8,7,6,5]
v19 tmp[9,10,11,12]
v0 tmp[4,13]
*/

	/* output elements 5..12
	   x0 = x4 slot 5 (5*128 bytes); v1,v2 = x3[5..12]; v3,v4 = x3[23..30];
	   v5,v6 = x1[5..12] */
	add			x0, x4, #640
	add			x5, x3, #20
	add			x6, x3, #92
	add			x7, x1, #20
	ld1			{v1.4s,v2.4s}, [x5]
	ld1			{v3.4s,v4.4s}, [x6]
	ld1			{v5.4s,v6.4s}, [x7]
	fadd		v20.4s, v16.4s, v17.4s
	fsub		v21.4s, v16.4s, v17.4s
	fmul		v4.4s, v20.4s, v4.4s
	fmla		v6.4s, v21.4s, v2.4s
	/* rev64 followed by ext #8 reverses all four lanes */
	rev64		v20.4s, v20.4s
	rev64		v21.4s, v21.4s
	ext			v20.16b, v20.16b, v20.16b, #8
	ext			v21.16b, v21.16b, v21.16b, #8
	fmul		v3.4s, v20.4s, v3.4s
	fmla		v5.4s, v21.4s, v1.4s
	add			x5, x2, #20
	mov			x9, #128					/* byte stride between x4 slots */
	st1			{v3.4s,v4.4s}, [x5]			/* x2[5..12] */
	st1			{v5.s}[0], [x0], x9			/* x4 slots 5..12 */
	st1			{v5.s}[1], [x0], x9
	st1			{v5.s}[2], [x0], x9
	st1			{v5.s}[3], [x0], x9
	st1			{v6.s}[0], [x0], x9
	st1			{v6.s}[1], [x0], x9
	st1			{v6.s}[2], [x0], x9
	st1			{v6.s}[3], [x0], x9

	/* output elements 0..3 and 14..17
	   x0 = x4 slot 14 (14*128 bytes); v1 = x3[0..3]; v2,v3 = x3[14..21];
	   v4 = x3[32..35]; v5 = x1[0..3]; v6 = x1[14..17] */
	add			x0, x4, #1792
	add			x5, x3, #56
	add			x6, x3, #128
	add			x7, x1, #56
	ld1			{v1.4s}, [x3]
	ld1			{v2.4s,v3.4s}, [x5]
	ld1			{v4.4s}, [x6]
	ld1			{v5.4s}, [x1]
	ld1			{v6.4s}, [x7]
	fadd		v20.4s, v18.4s, v19.4s
	fsub		v21.4s, v18.4s, v19.4s
	fmul		v3.4s, v20.4s, v3.4s
	fmla		v5.4s, v21.4s, v1.4s
	rev64		v20.4s, v20.4s
	rev64		v21.4s, v21.4s
	ext			v20.16b, v20.16b, v20.16b, #8
	ext			v21.16b, v21.16b, v21.16b, #8
	fmul		v4.4s, v20.4s, v4.4s
	fmla		v6.4s, v21.4s, v2.4s
	add			x5, x2, #56
	st1			{v3.4s}, [x2]				/* x2[0..3] */
	st1			{v4.4s}, [x5]				/* x2[14..17] */
	st1			{v5.s}[0], [x4], x9			/* x4 slots 0..3; x4 ends up at slot 4 */
	st1			{v5.s}[1], [x4], x9
	st1			{v5.s}[2], [x4], x9
	st1			{v5.s}[3], [x4], x9
	st1			{v6.s}[0], [x0], x9			/* x4 slots 14..17 */
	st1			{v6.s}[1], [x0], x9
	st1			{v6.s}[2], [x0], x9
	st1			{v6.s}[3], [x0], x9

	/* output elements 4 and 13, scalar: s0 = tmp[4], s1 = tmp[13] */
	ins			v1.s[0], v0.s[1]
	ldr			s2, [x3, #16]				/* x3[4] */
	ldr			s3, [x3, #52]				/* x3[13] */
	ldr			s4, [x3, #88]				/* x3[22] */
	ldr			s5, [x3, #124]				/* x3[31] */
	ldr			s6, [x1, #16]				/* x1[4] */
	ldr			s7, [x1, #52]				/* x1[13] */
	fadd		s16, s0, s1
	fsub		s17, s0, s1
	fmul		s4, s16, s4
	fmul		s5, s16, s5
	fmadd		s6, s17, s2, s6
	fmadd		s7, s17, s3, s7
	str			s4, [x2, #16]				/* x2[4] */
	str			s5, [x2, #52]				/* x2[13] */
	/* x4 was advanced by 4*128 above, so these two hit slots 4 and 13 */
	str			s6, [x4]
	str			s7, [x4, #1152]

	ret

NONEXEC_STACK
