1/*
2	synth_neon64: NEON optimized synth for AArch64
3
4	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
#ifndef __APPLE__
	.section	.rodata
#else
	/* Mach-O assemblers lack .rodata; fall back to .data (this table is
	   only ever read, via ld2r below). */
	.data
#endif
	ALIGN16
/* int16 saturation bounds, stored as two consecutive 32-bit words so a
   single ld2r can broadcast them pairwise into two vector registers:
   first word -> every lane of the first register (max, 32767),
   second word -> every lane of the second register (min, -32768). */
maxmin_s16:
	.word   32767
	.word   -32768
20	.text
21	ALIGN4
22	.globl ASM_NAME(synth_1to1_neon64_asm)
23#ifdef __ELF__
24	.type ASM_NAME(synth_1to1_neon64_asm), %function
25#endif
26ASM_NAME(synth_1to1_neon64_asm):
27	add		x0, x0, #32
28	sub		x0, x0, x3, lsl #1
29	eor		v31.16b, v31.16b, v31.16b
30	adrp	x5, AARCH64_PCREL_HI(maxmin_s16)
31	add		x5, x5, AARCH64_PCREL_LO(maxmin_s16)
32	ld2r	{v28.4s,v29.4s}, [x5]
33
34	mov		w4, #4
35	mov		x5, #64
361:
37	ld1		{v0.8h,v1.8h}, [x0], x5
38	ld1		{v2.8h,v3.8h}, [x0], x5
39	ld1		{v4.8h,v5.8h}, [x0], x5
40	ld1		{v6.8h,v7.8h}, [x0], x5
41	ld1		{v16.8h,v17.8h,v18.8h,v19.8h}, [x1], #64
42	ld1		{v20.8h,v21.8h,v22.8h,v23.8h}, [x1], #64
43
44	smull	v24.4s, v0.4h, v16.4h
45	smull	v25.4s, v2.4h, v18.4h
46	smull	v26.4s, v4.4h, v20.4h
47	smull	v27.4s, v6.4h, v22.4h
48	smlal2	v24.4s, v0.8h, v16.8h
49	smlal2	v25.4s, v2.8h, v18.8h
50	smlal2	v26.4s, v4.8h, v20.8h
51	smlal2	v27.4s, v6.8h, v22.8h
52	smlal	v24.4s, v1.4h, v17.4h
53	smlal	v25.4s, v3.4h, v19.4h
54	smlal	v26.4s, v5.4h, v21.4h
55	smlal	v27.4s, v7.4h, v23.4h
56	smlal2	v24.4s, v1.8h, v17.8h
57	smlal2	v25.4s, v3.8h, v19.8h
58	smlal2	v26.4s, v5.8h, v21.8h
59	smlal2	v27.4s, v7.8h, v23.8h
60
61	addp	v0.4s, v24.4s, v25.4s
62	addp	v1.4s, v26.4s, v27.4s
63	addp	v0.4s, v0.4s, v1.4s
64	ld2		{v4.4h,v5.4h}, [x2]
65	sqrshrn	v4.4h, v0.4s, #13
66	cmgt	v2.4s, v0.4s, v28.4s
67	cmgt	v3.4s, v29.4s, v0.4s
68	add		v2.4s, v2.4s, v3.4s
69	add		v31.4s, v31.4s, v2.4s
70	st2		{v4.4h,v5.4h}, [x2], #16
71
72	subs	w4, w4, #1
73	b.ne	1b
74
75	mov		w4, #4
76	mov		x6, #-32
772:
78	ld1		{v0.8h,v1.8h}, [x0], x5
79	ld1		{v2.8h,v3.8h}, [x0], x5
80	ld1		{v4.8h,v5.8h}, [x0], x5
81	ld1		{v6.8h,v7.8h}, [x0], x5
82	ld1		{v16.8h,v17.8h}, [x1], x6
83	ld1		{v18.8h,v19.8h}, [x1], x6
84	ld1		{v20.8h,v21.8h}, [x1], x6
85	ld1		{v22.8h,v23.8h}, [x1], x6
86
87	smull	v24.4s, v0.4h, v16.4h
88	smull	v25.4s, v2.4h, v18.4h
89	smull	v26.4s, v4.4h, v20.4h
90	smull	v27.4s, v6.4h, v22.4h
91	smlal2	v24.4s, v0.8h, v16.8h
92	smlal2	v25.4s, v2.8h, v18.8h
93	smlal2	v26.4s, v4.8h, v20.8h
94	smlal2	v27.4s, v6.8h, v22.8h
95	smlal	v24.4s, v1.4h, v17.4h
96	smlal	v25.4s, v3.4h, v19.4h
97	smlal	v26.4s, v5.4h, v21.4h
98	smlal	v27.4s, v7.4h, v23.4h
99	smlal2	v24.4s, v1.8h, v17.8h
100	smlal2	v25.4s, v3.8h, v19.8h
101	smlal2	v26.4s, v5.8h, v21.8h
102	smlal2	v27.4s, v7.8h, v23.8h
103
104	addp	v0.4s, v24.4s, v25.4s
105	addp	v1.4s, v26.4s, v27.4s
106	addp	v0.4s, v0.4s, v1.4s
107	ld2		{v4.4h,v5.4h}, [x2]
108	sqrshrn	v4.4h, v0.4s, #13
109	cmgt	v2.4s, v0.4s, v28.4s
110	cmgt	v3.4s, v29.4s, v0.4s
111	add		v2.4s, v2.4s, v3.4s
112	add		v31.4s, v31.4s, v2.4s
113	st2		{v4.4h,v5.4h}, [x2], #16
114
115	subs	w4, w4, #1
116	b.ne	2b
117
118	AARCH64_DUP_2D(v0, v31, 1)
119	add		v0.4s, v0.4s, v31.4s
120	AARCH64_DUP_4S(v1, v0, 1)
121	add		v0.4s, v0.4s, v1.4s
122	umov	w0, v0.s[0]
123	neg		w0, w0
124
125	ret
126
127NONEXEC_STACK
128