1/*
2	synth_neon64_accurate: NEON optimized synth for AArch64 (MPEG compliant 16-bit output version)
3
4	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
/*
	maxmin_s16: the two clipping thresholds used by the synth routine below,
	stored as raw IEEE-754 single-precision bit patterns:
	  1191181824 = 0x46FFFE00 =  32767.0f  (largest signed 16-bit value)
	  -956301312 = 0xC7000000 = -32768.0f  (smallest signed 16-bit value)
	The table is placed in .data when __APPLE__ is defined and in .rodata
	otherwise.  ALIGN16 is a macro defined outside this file (presumably in
	mangle.h) and is assumed to request 16-byte alignment.
*/
11#ifndef __APPLE__
12	.section	.rodata
13#else
14	.data
15#endif
16	ALIGN16
17maxmin_s16:
18	.word   1191181824	/* +32767.0f: upper clip threshold */
19	.word   -956301312	/* -32768.0f: lower clip threshold */
/*
	synth_1to1_neon64_accurate_asm

	Computes 32 output samples.  Each sample is a 16-term single-precision
	dot product of one 64-byte block read through x0 with one 64-byte block
	read through x1, rounded to nearest and saturated to signed 16 bit.

	Presumed C prototype (not visible in this file, TODO confirm):
	  int synth_1to1_neon64_accurate_asm(float *window, float *b0,
	                                     short *samples, int bo1);

	In:
	  x0  float pointer.  The routine starts reading at x0 + 64 - 4*x3 bytes
	      and reads 32 blocks of 16 floats, one block every 128 bytes.
	  x1  float pointer.  16 blocks of 16 floats are read walking forward
	      (byte offsets 0..1023), then 16 blocks walking backward (blocks at
	      byte offsets 1024, 960, ..., 64), so 1088 bytes must be readable.
	  x2  pointer to 16-bit output.  Only every second halfword is replaced;
	      the halfwords in between are read and written back unchanged
	      (presumably the other channel of interleaved stereo).  128 bytes
	      are touched in total.
	  x3  element offset, scaled by 4 and subtracted from x0.
	      NOTE(review): x3 is consumed as a full 64-bit register.  If the C
	      prototype declares this argument as a 32-bit int, AAPCS64 leaves
	      bits 63:32 unspecified - confirm against the caller's declaration.
	Out:
	  w0  number of samples whose unrounded sum was > 32767.0 or < -32768.0.
	Clobbers:
	  x0-x2 (advanced), x4-x6, v0-v7, v16-v29, v31, NZCV.  No stack is used
	  and no AAPCS64 callee-saved register is written.
*/
20	.text
21	ALIGN4
22	.globl ASM_NAME(synth_1to1_neon64_accurate_asm)
23#ifdef __ELF__
24	.type ASM_NAME(synth_1to1_neon64_accurate_asm), %function
25#endif
26ASM_NAME(synth_1to1_neon64_accurate_asm):
	/* x0 = x0 + 64 - 4*x3: starting read position inside the x0 table. */
27	add		x0, x0, #64
28	sub		x0, x0, x3, lsl #2
	/* v31 = 0: per-lane clip accumulator (counts downward, see end). */
29	eor		v31.16b, v31.16b, v31.16b
	/* x5 = address of maxmin_s16.  AARCH64_PCREL_HI/LO come from outside this
	   file; presumably they select the page / page-offset relocations. */
30	adrp	x5, AARCH64_PCREL_HI(maxmin_s16)
31	add		x5, x5, AARCH64_PCREL_LO(maxmin_s16)
	/* Broadcast the two thresholds: v28 = 4 x 32767.0f, v29 = 4 x -32768.0f. */
32	ld2r	{v28.4s,v29.4s}, [x5]
33
	/* First half: 4 iterations x 4 samples.  x5 is reused as the byte stride
	   for x0 (128 bytes per sample while only 64 are read). */
34	mov		w4, #4
35	mov		x5, #128
361:
	/* Samples 0 and 1 of this group: load 16 floats each from x0 and x1. */
37	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
38	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
39	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
40	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
41
	/* Two independent multiply-accumulate chains, interleaved:
	   v24 = partial sums for sample 0, v25 = partial sums for sample 1. */
42	fmul	v24.4s, v0.4s, v16.4s
43	fmul	v25.4s, v4.4s, v20.4s
44	fmla	v24.4s, v1.4s, v17.4s
45	fmla	v25.4s, v5.4s, v21.4s
46	fmla	v24.4s, v2.4s, v18.4s
47	fmla	v25.4s, v6.4s, v22.4s
48	fmla	v24.4s, v3.4s, v19.4s
49	fmla	v25.4s, v7.4s, v23.4s
50
	/* Samples 2 and 3 of this group, same scheme into v26 / v27. */
51	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
52	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
53	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
54	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
55
56	fmul	v26.4s, v0.4s, v16.4s
57	fmul	v27.4s, v4.4s, v20.4s
58	fmla	v26.4s, v1.4s, v17.4s
59	fmla	v27.4s, v5.4s, v21.4s
60	fmla	v26.4s, v2.4s, v18.4s
61	fmla	v27.4s, v6.4s, v22.4s
62	fmla	v26.4s, v3.4s, v19.4s
63	fmla	v27.4s, v7.4s, v23.4s
64
	/* Pairwise reduction: after the three faddp, v0 holds the four complete
	   sums { sum(v24), sum(v25), sum(v26), sum(v27) }. */
65	faddp	v0.4s, v24.4s, v25.4s
66	faddp	v1.4s, v26.4s, v27.4s
67	faddp	v0.4s, v0.4s, v1.4s
	/* De-interleave 8 existing output halfwords: v4 = even slots (about to
	   be replaced), v5 = odd slots (kept as they are). */
68	ld2		{v4.4h,v5.4h}, [x2]
	/* Float to s32, rounding to nearest with ties to even. */
69	fcvtns	v1.4s, v0.4s
	/* Clip detection on the unrounded sums: a lane becomes all ones (-1)
	   where sum > 32767.0 (v2) or -32768.0 > sum (v3). */
70	fcmgt	v2.4s, v0.4s, v28.4s
71	fcmgt	v3.4s, v29.4s, v0.4s
	/* Saturating narrow s32 to s16; this is what actually clamps the output. */
72	sqxtn	v4.4h, v1.4s
	/* Both conditions cannot hold for the same lane, so each lane of v2+v3
	   is 0 or -1; v31 therefore accumulates minus the number of clips. */
73	add		v2.4s, v2.4s, v3.4s
74	add		v31.4s, v31.4s, v2.4s
	/* Re-interleave new even slots with the untouched odd slots; x2 += 16. */
75	st2		{v4.4h,v5.4h}, [x2], #16
76
77	subs	w4, w4, #1
78	b.ne	1b
79
	/* Second half: identical arithmetic, but x1 now walks backward in
	   64-byte steps (x6 = -64) while x0 keeps advancing by x5 = 128. */
80	mov		w4, #4
81	mov		x6, #-64
822:
83	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
84	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
85	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
86	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
87
88	fmul	v24.4s, v0.4s, v16.4s
89	fmul	v25.4s, v4.4s, v20.4s
90	fmla	v24.4s, v1.4s, v17.4s
91	fmla	v25.4s, v5.4s, v21.4s
92	fmla	v24.4s, v2.4s, v18.4s
93	fmla	v25.4s, v6.4s, v22.4s
94	fmla	v24.4s, v3.4s, v19.4s
95	fmla	v25.4s, v7.4s, v23.4s
96
97	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
98	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
99	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
100	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
101
102	fmul	v26.4s, v0.4s, v16.4s
103	fmul	v27.4s, v4.4s, v20.4s
104	fmla	v26.4s, v1.4s, v17.4s
105	fmla	v27.4s, v5.4s, v21.4s
106	fmla	v26.4s, v2.4s, v18.4s
107	fmla	v27.4s, v6.4s, v22.4s
108	fmla	v26.4s, v3.4s, v19.4s
109	fmla	v27.4s, v7.4s, v23.4s
110
	/* Reduce, convert, detect clipping and store: same as in the first loop. */
111	faddp	v0.4s, v24.4s, v25.4s
112	faddp	v1.4s, v26.4s, v27.4s
113	faddp	v0.4s, v0.4s, v1.4s
114	ld2		{v4.4h,v5.4h}, [x2]
115	fcvtns	v1.4s, v0.4s
116	fcmgt	v2.4s, v0.4s, v28.4s
117	fcmgt	v3.4s, v29.4s, v0.4s
118	sqxtn	v4.4h, v1.4s
119	add		v2.4s, v2.4s, v3.4s
120	add		v31.4s, v31.4s, v2.4s
121	st2		{v4.4h,v5.4h}, [x2], #16
122
123	subs	w4, w4, #1
124	b.ne	2b
125
	/* Fold the four lanes of v31 into lane 0.  AARCH64_DUP_2D / AARCH64_DUP_4S
	   are defined outside this file; presumably they expand to
	   "dup v0.2d, v31.d[1]" and "dup v1.4s, v0.s[1]" - TODO confirm. */
126	AARCH64_DUP_2D(v0, v31, 1)
127	add		v0.4s, v0.4s, v31.4s
128	AARCH64_DUP_4S(v1, v0, 1)
129	add		v0.4s, v0.4s, v1.4s
	/* The accumulated value is minus the clip count; negate it for the
	   return value in w0. */
130	umov	w0, v0.s[0]
131	neg		w0, w0
132
133	ret
134
135NONEXEC_STACK
136