1/*
2	synth_stereo_neon64_float: NEON optimized synth for AArch64 (stereo specific, float output version)
3
4	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
/* Constant data: .data on Apple targets, .rodata everywhere else. */
#ifdef __APPLE__
	.data
#else
	.section	.rodata
#endif
	ALIGN16
/*
	Output scale factor, replicated into every vector lane by the synth routine.
	0x38000000 is the IEEE-754 single-precision bit pattern of 2^-15 (1/32768).
*/
scale:
	.word   0x38000000

/* Code section and export of the synth entry point. */
	.text
	ALIGN4
	.globl ASM_NAME(synth_1to1_real_s_neon64_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_real_s_neon64_asm), %function
#endif
/*
	synth_1to1_real_s_neon64_asm

	Presumed C prototype (declared outside this file -- confirm against the caller):
		int synth_1to1_real_s_neon64_asm(real *window, real *b0l, real *b0r,
		                                 real *samples, int bo1);

	In:
		x0  window coefficients; reading starts at x0 + 64 - 4*w4 bytes and
		    consumes 16 floats out of every 128-byte row
		x1  input feeding output lanes 0 and 2 (first channel)
		x2  input feeding output lanes 1 and 3 (second channel)
		x3  output; 64 floats (256 bytes) are written, channels interleaved
		w4  32-bit offset, counted in 4-byte elements
	Out:
		w0  always 0
	Clobbers:
		x4-x6, v0-v7, v16-v28, flags; x1-x3 are advanced.
		The low 64 bits of v8-v15 are saved on the stack (64 bytes) and restored.

	Every output value is a 16-term dot product of one window row with 16 input
	floats, multiplied by the constant stored at "scale".
*/
ASM_NAME(synth_1to1_real_s_neon64_asm):
	add		x0, x0, #64
	/* Only the low 32 bits of a 32-bit argument are defined by the ABI, so
	   extend w4 explicitly instead of trusting the upper half of x4. */
	sub		x0, x0, w4, sxtw #2
	adrp	x5, AARCH64_PCREL_HI(scale)
	add		x5, x5, AARCH64_PCREL_LO(scale)
	ld1r	{v28.4s}, [x5]			/* v28 = scale in all four lanes */

	/* v8-v15 are used as scratch below; keep their callee-saved low halves. */
	sub		sp, sp, #32
	st1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp]
	sub		sp, sp, #32
	st1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp]

	/* First pass: x1/x2 walk forward, 4 iterations x 2 stores x 4 floats. */
	mov		w4, #4				/* iteration counter */
	mov		x5, #128			/* byte stride between window rows */
1:
	/* Two window rows (v0-v3, v4-v7) and two 16-float runs per channel. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64

	/* Four independent accumulators:
	   v24 = row0 * x1 data, v25 = row0 * x2 data,
	   v26 = row1 * x1 data, v27 = row1 * x2 data. */
	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	/* Horizontal reduction: v0 = { sum(v24), sum(v25), sum(v26), sum(v27) }. */
	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	/* Second half of the unrolled iteration: same computation, next rows. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	1b

	/* Second pass: identical arithmetic, but x1/x2 now step backwards by
	   64 bytes after every load while the window keeps moving forward. */
	mov		w4, #4				/* iteration counter */
	mov		x6, #-64			/* negative post-increment for x1/x2 */
2:
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	2b

	mov		w0, wzr				/* return value: 0 */
	/* Restore in reverse order of the saves; sp ends at its entry value. */
	ld1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp], #32
	ld1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp], #32

	ret

/* NONEXEC_STACK is provided by mangle.h; presumably it marks this object as
   not requiring an executable stack -- confirm against the header. */
NONEXEC_STACK
175