1/*
2	synth_neon64_s32: NEON optimized synth for AArch64 (32-bit output version)
3
4	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
11#ifndef __APPLE__
12	.section	.rodata
13#else
14	.data
15#endif
16	ALIGN16
/*
	maxmin_s32: three consecutive 32-bit words that the synth routine in this
	file loads with a single ld3r and uses as single-precision floats.
	Interpreted as IEEE-754 binary32 bit patterns:
	  word 0 = 0x46FFFFFF : largest float below 32768.0 (upper range bound)
	  word 1 = 0xC7000000 : -32768.0                    (lower range bound)
	  word 2 = 0x47800000 : 65536.0                     (scale factor applied
	                                                      before int conversion)
*/
17maxmin_s32:
18	.word   1191182335	/* 0x46FFFFFF */
19	.word   -956301312	/* 0xC7000000 */
20	.word   1199570944	/* 0x47800000 */
21	.text
22	ALIGN4
/*
	synth_1to1_s32_neon64_asm

	Computes 32 dot products of 16 single-precision floats each, scales every
	result by the third maxmin_s32 constant (65536.0), converts it to a signed
	32-bit integer (round to nearest, saturating) and writes it to every other
	32-bit slot of the buffer at x2.  The slots in between are read and
	written back unchanged.

	In:
	  x0  pointer to float data; adjusted on entry to x0 + 64 - x3*4 bytes,
	      then read in 64-byte pieces with a 128-byte stride
	  x1  pointer to float data; read forwards in 64-byte steps by the first
	      loop, then backwards in 64-byte steps by the second loop
	  x2  pointer to the 32-bit integer output; advanced 32 bytes per
	      iteration (8 iterations in total)
	  x3  offset counted in 4-byte elements
	Out:
	  w0  number of (unscaled) results that were greater than the first
	      maxmin_s32 constant or less than the second one

	Clobbers: x0-x2, x4-x6, v0-v7, v16-v31, condition flags.  The stack and
	the callee-saved registers (x19-x28, v8-v15) are not touched.

	NOTE(review): the C prototype is not visible in this file; presumably
	(float *window, float *b0, int32_t *samples, int bo1) with the odd output
	slots belonging to the other stereo channel - confirm against the caller.
	NOTE(review): x3 is used as a full 64-bit register below.  If the caller
	passes a 32-bit int, AAPCS64 leaves bits 63:32 unspecified - confirm the
	prototype / whether an explicit extension is required.
*/
23	.globl ASM_NAME(synth_1to1_s32_neon64_asm)
24#ifdef __ELF__
25	.type ASM_NAME(synth_1to1_s32_neon64_asm), %function
26#endif
27ASM_NAME(synth_1to1_s32_neon64_asm):
28	add		x0, x0, #64	/* x0 += 16 floats ... */
29	sub		x0, x0, x3, lsl #2	/* ... minus x3 floats (x3 * 4 bytes) */
30	eor		v31.16b, v31.16b, v31.16b	/* v31 = 0: per-lane out-of-range accumulator */
	/* x5 = address of maxmin_s32; the PCREL macros are defined outside this
	   file (presumably page / low-12-bit relocations - confirm in mangle.h) */
31	adrp	x5, AARCH64_PCREL_HI(maxmin_s32)
32	add		x5, x5, AARCH64_PCREL_LO(maxmin_s32)
33	ld3r	{v28.4s,v29.4s,v30.4s}, [x5]	/* broadcast: v28 = upper bound, v29 = lower bound, v30 = scale */
34

	/*
		Loop 1: 4 iterations, 4 results per iteration.
		Every result is a 16-element dot product of 64 bytes at x0 (x0 then
		moves on by 128 bytes) and 64 bytes at x1 (x1 then moves on by 64).
		v24..v27 each collect four partial sums of one dot product.
	*/
35	mov		w4, #4	/* iteration counter */
36	mov		x5, #128	/* post-index stride for x0, in bytes */
371:
38	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
39	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
40	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
41	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
42
	/* dot products 0 (v24) and 1 (v25), interleaved */
43	fmul	v24.4s, v0.4s, v16.4s
44	fmul	v25.4s, v4.4s, v20.4s
45	fmla	v24.4s, v1.4s, v17.4s
46	fmla	v25.4s, v5.4s, v21.4s
47	fmla	v24.4s, v2.4s, v18.4s
48	fmla	v25.4s, v6.4s, v22.4s
49	fmla	v24.4s, v3.4s, v19.4s
50	fmla	v25.4s, v7.4s, v23.4s
51
52	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
53	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
54	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
55	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
56
	/* dot products 2 (v26) and 3 (v27), interleaved */
57	fmul	v26.4s, v0.4s, v16.4s
58	fmul	v27.4s, v4.4s, v20.4s
59	fmla	v26.4s, v1.4s, v17.4s
60	fmla	v27.4s, v5.4s, v21.4s
61	fmla	v26.4s, v2.4s, v18.4s
62	fmla	v27.4s, v6.4s, v22.4s
63	fmla	v26.4s, v3.4s, v19.4s
64	fmla	v27.4s, v7.4s, v23.4s
65
	/* two rounds of pairwise adds reduce the partial sums to one result per lane */
66	faddp	v0.4s, v24.4s, v25.4s
67	faddp	v1.4s, v26.4s, v27.4s
68	faddp	v0.4s, v0.4s, v1.4s	/* v0 = {sum(v24), sum(v25), sum(v26), sum(v27)} */
69	fmul	v1.4s, v0.4s, v30.4s	/* v1 = results scaled by v30 */
70	ld2		{v4.4s,v5.4s}, [x2]	/* de-interleave 8 words at x2: v4 = even slots, v5 = odd slots */
71	fcvtns	v4.4s, v1.4s	/* replace even slots: float -> int32, round to nearest, saturating */
72	fcmgt	v2.4s, v0.4s, v28.4s	/* lane = all ones (-1) where result > upper bound */
73	fcmgt	v3.4s, v29.4s, v0.4s	/* lane = all ones (-1) where lower bound > result */
74	add		v2.4s, v2.4s, v3.4s
75	add		v31.4s, v31.4s, v2.4s	/* v31 lanes hold minus the out-of-range count so far */
76	st2		{v4.4s,v5.4s}, [x2], #32	/* re-interleave and store; odd slots unchanged; x2 += 32 */
77
78	subs	w4, w4, #1
79	b.ne	1b
80

	/*
		Loop 2: same computation as loop 1 for 4 more iterations, except that
		x1 is post-decremented by 64 bytes after every load, i.e. the x1 data
		is walked backwards starting from where loop 1 stopped.
	*/
81	mov		w4, #4	/* iteration counter */
82	mov		x6, #-64	/* post-index stride for x1, in bytes */
832:
84	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
85	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
86	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
87	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
88
	/* dot products 0 (v24) and 1 (v25), interleaved */
89	fmul	v24.4s, v0.4s, v16.4s
90	fmul	v25.4s, v4.4s, v20.4s
91	fmla	v24.4s, v1.4s, v17.4s
92	fmla	v25.4s, v5.4s, v21.4s
93	fmla	v24.4s, v2.4s, v18.4s
94	fmla	v25.4s, v6.4s, v22.4s
95	fmla	v24.4s, v3.4s, v19.4s
96	fmla	v25.4s, v7.4s, v23.4s
97
98	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
99	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
100	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
101	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
102
	/* dot products 2 (v26) and 3 (v27), interleaved */
103	fmul	v26.4s, v0.4s, v16.4s
104	fmul	v27.4s, v4.4s, v20.4s
105	fmla	v26.4s, v1.4s, v17.4s
106	fmla	v27.4s, v5.4s, v21.4s
107	fmla	v26.4s, v2.4s, v18.4s
108	fmla	v27.4s, v6.4s, v22.4s
109	fmla	v26.4s, v3.4s, v19.4s
110	fmla	v27.4s, v7.4s, v23.4s
111
	/* reduce, scale, convert, count and store exactly as in loop 1 */
112	faddp	v0.4s, v24.4s, v25.4s
113	faddp	v1.4s, v26.4s, v27.4s
114	faddp	v0.4s, v0.4s, v1.4s	/* v0 = {sum(v24), sum(v25), sum(v26), sum(v27)} */
115	fmul	v1.4s, v0.4s, v30.4s	/* v1 = results scaled by v30 */
116	ld2		{v4.4s,v5.4s}, [x2]	/* v4 = even slots, v5 = odd slots */
117	fcvtns	v4.4s, v1.4s	/* replace even slots: float -> int32, round to nearest, saturating */
118	fcmgt	v2.4s, v0.4s, v28.4s	/* -1 where result > upper bound */
119	fcmgt	v3.4s, v29.4s, v0.4s	/* -1 where lower bound > result */
120	add		v2.4s, v2.4s, v3.4s
121	add		v31.4s, v31.4s, v2.4s
122	st2		{v4.4s,v5.4s}, [x2], #32	/* odd slots unchanged; x2 += 32 */
123
124	subs	w4, w4, #1
125	b.ne	2b
126

	/*
		Fold the four lanes of v31 into lane 0 and negate, turning the
		accumulated -1 values into a positive count returned in w0.
		AARCH64_DUP_2D / AARCH64_DUP_4S are macros defined outside this file
		(presumably "dup Vd.2d, Vn.d[1]" and "dup Vd.4s, Vn.s[1]" - confirm
		in mangle.h).
	*/
127	AARCH64_DUP_2D(v0, v31, 1)
128	add		v0.4s, v0.4s, v31.4s
129	AARCH64_DUP_4S(v1, v0, 1)
130	add		v0.4s, v0.4s, v1.4s
131	umov	w0, v0.s[0]
132	neg		w0, w0
133
134	ret
135
/* NONEXEC_STACK is a macro defined outside this file - presumably it emits the
   marker section that flags this object as not needing an executable stack;
   confirm in mangle.h. */
136NONEXEC_STACK
137