/*
	synth_stereo_avx: AVX optimized synth for x86-64 (stereo specific version)

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifdef IS_MSABI
/* short *window; */
#define WINDOW %r10
/* short *b0l; */
#define B0L %rdx
/* short *b0r; */
#define B0R %r8
/* short *samples; */
#define SAMPLES %r9
#else
/* short *window; */
#define WINDOW %rdi
/* short *b0l; */
#define B0L %rsi
/* short *b0r; */
#define B0R %rdx
/* short *samples; */
#define SAMPLES %r9
#endif

/*
	int synth_1to1_s_avx_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
	return value: number of clipped samples
*/
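
/*
	For orientation, a minimal C sketch of what this routine computes, as
	read from the assembly below. This is NOT code from the mpg123 sources:
	the function name is made up, the 16-short window bias and the 256-short
	turnaround of the b0 walk are inferred from the instructions and offsets,
	and the 16-bit window table is assumed to have the alternating signs of
	the synthesis window folded in already (so a plain multiply-add works).
	The asm keeps its accumulators in 32 bits; long is used here for brevity.

	int synth_1to1_s_ref(const short *window, const short *b0l,
	                     const short *b0r, short *samples, int bo1)
	{
		int clip = 0;
		const short *w = window + 16 - bo1; // mirrors the pointer setup below

		for(int i = 0; i < 32; ++i)
		{
			// b0 walks forward for the first 16 samples, backward after
			int k = (i < 16) ? 16*i : 256 - 16*(i-16);
			long sl = 0, sr = 0;

			for(int j = 0; j < 16; ++j) // 16-tap dot product per channel
			{
				sl += (long)w[j] * b0l[k+j];
				sr += (long)w[j] * b0r[k+j];
			}
			w += 32; // window rows are 32 shorts apart; 16 of them are used

			sl >>= 13; // fixed-point scaling, then saturate to 16 bits
			sr >>= 13;
			if(sl != (short)sl) { sl = sl < 0 ? -32768 : 32767; ++clip; }
			if(sr != (short)sr) { sr = sr < 0 ? -32768 : 32767; ++clip; }
			*samples++ = (short)sl; // interleaved stereo output
			*samples++ = (short)sr;
		}
		return clip;
	}
*/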

	ALIGN16
.globl ASM_NAME(synth_1to1_s_avx_asm)
ASM_NAME(synth_1to1_s_avx_asm):
#ifdef IS_MSABI /* should save xmm6-15 */
	push		%rbp
	mov			%rsp, %rbp
	sub			$144, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	movl		48(%rbp), %eax /* 5th argument; placed after 32-byte shadow space */
#endif

#ifdef IS_MSABI
	shl			$1, %eax
	mov			%rcx, WINDOW
#else
	mov			%r8d, %eax
	shl			$1, %eax
	movq		%rcx, SAMPLES
#endif
	add			$32, WINDOW
	sub			%rax, WINDOW
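	/* WINDOW = window + 16 - bo1 (in shorts); %rax holds 2*bo1, i.e. bo1
	   as a byte offset into the 16-bit window table */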

	mov			$64, %rax
	movl		$4, %ecx
	vpxor		%xmm14, %xmm14, %xmm14
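	/* %rax: 64-byte stride between window rows (32 shorts); %ecx: loop
	   counter; %xmm14 accumulates -1 for every clipped output word */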

	ALIGN16
1:
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	32(B0L), %xmm10, %xmm4
	vpmaddwd	48(B0L), %xmm11, %xmm5
	vpmaddwd	32(B0R), %xmm10, %xmm6
	vpmaddwd	48(B0R), %xmm11, %xmm7
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea			(WINDOW,%rax,2), WINDOW
	add			%rax, B0L
	add			%rax, B0R

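	/* same again for the next two window rows, i.e. samples 2 and 3 of
	   this iteration */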
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	32(B0L), %xmm12, %xmm6
	vpmaddwd	48(B0L), %xmm13, %xmm10
	vpmaddwd	32(B0R), %xmm12, %xmm7
	vpmaddwd	48(B0R), %xmm13, %xmm11
	vpaddd		%xmm3, %xmm2, %xmm2
	vpaddd		%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm10, %xmm4
	vpaddd		%xmm7, %xmm11, %xmm5
	lea			(WINDOW,%rax,2), WINDOW
	add			%rax, B0L
	add			%rax, B0R

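	/* transpose stage: interleave left/right dwords so each of the four
	   stereo samples has its partial sums paired up; fold 4 -> 2 */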
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

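	/* fold the remaining partials 2 -> 1, scale by >>13 and pack with
	   signed saturation to 16 bits; a truncated copy (shift pair plus
	   vpackusdw) is compared against the saturated one, so every clipped
	   word adds -1 to the counters in %xmm14 */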
	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2
	vpcmpeqd	%xmm3, %xmm3, %xmm3
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14

	movups		%xmm2, (SAMPLES)
	add			$16, SAMPLES
	dec			%ecx
	jnz			1b

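/*
	Second 16 samples: WINDOW keeps walking forward while B0L/B0R now walk
	backward (negative offsets, sub instead of add), scanning the mirrored
	half of the synthesis window.
*/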
	movl		$4, %ecx

	ALIGN16
1:
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	-32(B0L), %xmm10, %xmm4
	vpmaddwd	-16(B0L), %xmm11, %xmm5
	vpmaddwd	-32(B0R), %xmm10, %xmm6
	vpmaddwd	-16(B0R), %xmm11, %xmm7
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea			(WINDOW,%rax,2), WINDOW
	sub			%rax, B0L
	sub			%rax, B0R

	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	-32(B0L), %xmm12, %xmm6
	vpmaddwd	-16(B0L), %xmm13, %xmm10
	vpmaddwd	-32(B0R), %xmm12, %xmm7
	vpmaddwd	-16(B0R), %xmm13, %xmm11
	vpaddd		%xmm3, %xmm2, %xmm2
	vpaddd		%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm10, %xmm4
	vpaddd		%xmm7, %xmm11, %xmm5
	lea			(WINDOW,%rax,2), WINDOW
	sub			%rax, B0L
	sub			%rax, B0R

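	/* reduction, scaling, packing and clip counting identical to the
	   first loop */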
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2
	vpcmpeqd	%xmm3, %xmm3, %xmm3
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14

	movups		%xmm2, (SAMPLES)
	add			$16, SAMPLES
	dec			%ecx
	jnz			1b

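	/* horizontal sum of the clip counters: the word lanes of %xmm14 hold
	   -count, so negate, fold the eight words down to one, and mask with
	   0x7f (at most 64 output words can clip per call) */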
	pxor		%xmm1, %xmm1
	psubw		%xmm14, %xmm1
	pshufd		$0x4e, %xmm1, %xmm0
	paddw		%xmm1, %xmm0
	pshuflw		$0x4e, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	pshuflw		$0x11, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	movd		%xmm0, %eax
	and			$0x7f, %eax

#ifdef IS_MSABI
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	mov			%rbp, %rsp
	pop			%rbp
#endif
	ret

NONEXEC_STACK