1/*
2	synth_stereo_x86_64: SSE optimized synth for x86-64 (stereo specific version)
3
4	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5	see COPYING and AUTHORS files in distribution or http://mpg123.org
6	initially written by Taihei Monma
7*/
8
9#include "mangle.h"
10
11#ifdef IS_MSABI
12/* short *window; */
13#define WINDOW %r10
14/* short *b0l; */
15#define B0L %rdx
16/* short *b0r; */
17#define B0R %r8
18/* short *samples; */
19#define SAMPLES %r9
20#else
21/* short *window; */
22#define WINDOW %rdi
23/* short *b0l; */
24#define B0L %rsi
25/* short *b0r; */
26#define B0R %rdx
27/* short *samples; */
28#define SAMPLES %r9
29#endif
30
31#define XMMREG_CLIP %xmm15
32#define XMMREG_MAX %xmm14 /* {32767, 32767, 32767, 32767} */
33#define XMMREG_MIN %xmm13 /* {-32769, -32769, -32769, -32769} : not -32768 because SSE doesn't have "less than" comparison... */
34#define XMMREG_FULL %xmm12 /* {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF} */
35
36/*
37	int synth_1to1_s_x86_64_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
38	return value: number of clipped samples
39*/
40
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
/*
	Clipping-bound constant table, loaded once at function entry:
	bytes  0..15 = {32767 x4}  -> XMMREG_MAX (upper bound for a 16-bit sample)
	bytes 16..31 = {-32769 x4} -> XMMREG_MIN (lower bound minus one; the
	compare used later is pcmpgtd, i.e. "greater than", so the limit is
	expressed as -32769 to detect values < -32768 — see the XMMREG_MIN
	note near the top of the file).
*/
ASM_NAME(maxmin_x86_64):
	.long   32767
	.long   32767
	.long   32767
	.long   32767
	.long   -32769
	.long   -32769
	.long   -32769
	.long   -32769
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_s_x86_64_asm)
/*
	int synth_1to1_s_x86_64_asm(short *window, short *b0l, short *b0r,
	                            short *samples, int bo1);

	Stereo synthesis window filter: for 32 output frames, multiply-accumulate
	16-bit window coefficients against the left (b0l) and right (b0r) filter
	banks with pmaddwd, scale, interleave L/R, and store saturated 16-bit
	stereo samples. Returns the number of samples that clipped.

	Register roles (via the #defines above): WINDOW walks the coefficient
	table, B0L/B0R walk the two filter banks, SAMPLES is the output pointer.
	xmm12-xmm15 hold loop-invariant constants (all-ones, min, max, clip
	counter); on Win64 those are callee-saved, hence the save/restore block.
*/
ASM_NAME(synth_1to1_s_x86_64_asm):
#ifdef IS_MSABI /* should save xmm6-15 */
	movl		40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	/* 168 = 8 alignment pad + 10*16 bytes; entry rsp%16==8, so rsp is
	   16-aligned afterwards and movaps saves are safe. */
	subq		$168, %rsp /* stack alignment + 10 xmm registers */
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	movaps		%xmm15, 144(%rsp)
#endif

/* Compute rax = 2*bo1 (byte offset of bo1 shorts) from the 32-bit 5th
   argument: shl 32 discards any garbage in the upper half, shr 31 brings
   it back doubled.  Also move the 4th argument into SAMPLES/WINDOW's
   scratch register so the #defined names line up for both ABIs. */
#ifdef IS_MSABI
	shlq		$32, %rax
	shrq		$31, %rax
	movq		%rcx, %r10 /* WINDOW = 1st arg */
#else
	movq		%r8, %rax /* bo1 (5th arg) */
	shlq		$32, %rax
	shrq		$31, %rax
	movq		%rcx, %r9 /* SAMPLES = 4th arg */
#endif
	/* window += 16 shorts, then back up by bo1 shorts */
	leaq		32(WINDOW), WINDOW
	subq		%rax, WINDOW

	/* Load loop-invariant constants (RIP-relative for PIC). */
	leaq		ASM_NAME(maxmin_x86_64)(%rip), %rax
	movaps		(%rax), XMMREG_MAX /* {32767 x4} */
	movaps		16(%rax), XMMREG_MIN /* {-32769 x4} */
	pxor		XMMREG_CLIP, XMMREG_CLIP /* clip counter = 0 */
	pcmpeqd		XMMREG_FULL, XMMREG_FULL /* all-ones, for mask inversion */

	movl		$4, %ecx /* first half: 4 iterations x 4 stereo frames */

	ALIGN16
1:
	/*
		First loop (16 frames): B0L/B0R advance forward through the banks.
		Each iteration computes 4 left and 4 right samples.  Eight window
		vectors (xmm0-7) are each pmaddwd'ed against the left bank, and
		copies of them (xmm8-11, then reusing xmm1/xmm9/xmm3/xmm11) against
		the right bank; partial dword sums are combined pairwise.
	*/
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		64(WINDOW), %xmm2
	movups		80(WINDOW), %xmm3
	movups		128(WINDOW), %xmm4
	movups		144(WINDOW), %xmm5
	movups		192(WINDOW), %xmm6
	movups		208(WINDOW), %xmm7
	movaps		%xmm0, %xmm8 /* duplicate coefficients for right channel */
	movaps		%xmm1, %xmm9
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	pmaddwd		(B0L), %xmm0 /* left channel MACs */
	pmaddwd		16(B0L), %xmm1
	pmaddwd		32(B0L), %xmm2
	pmaddwd		48(B0L), %xmm3
	pmaddwd		(B0R), %xmm8 /* right channel MACs */
	pmaddwd		16(B0R), %xmm9
	pmaddwd		32(B0R), %xmm10
	pmaddwd		48(B0R), %xmm11
	paddd		%xmm1, %xmm0 /* fold partial sums: L -> xmm0,xmm2 */
	paddd		%xmm3, %xmm2
	paddd		%xmm9, %xmm8 /* R -> xmm8,xmm10 */
	paddd		%xmm11, %xmm10
	movaps		%xmm4, %xmm1 /* xmm1/xmm9/xmm3/xmm11 are free again */
	movaps		%xmm5, %xmm9
	movaps		%xmm6, %xmm3
	movaps		%xmm7, %xmm11
	pmaddwd		64(B0L), %xmm4
	pmaddwd		80(B0L), %xmm5
	pmaddwd		96(B0L), %xmm6
	pmaddwd		112(B0L), %xmm7
	pmaddwd		64(B0R), %xmm1
	pmaddwd		80(B0R), %xmm9
	pmaddwd		96(B0R), %xmm3
	pmaddwd		112(B0R), %xmm11
	paddd		%xmm5, %xmm4 /* L -> xmm4,xmm6 */
	paddd		%xmm7, %xmm6
	paddd		%xmm1, %xmm9 /* R -> xmm9,xmm11 */
	paddd		%xmm3, %xmm11

	/*
		Horizontal reduction: transpose the four per-sample partial-sum
		vectors (punpck + movlhps/movhlps) so that lanes line up, then add.
		Result: xmm0 = 4 left samples, xmm8 = 4 right samples.
	*/
	movaps		%xmm0, %xmm1
	movaps		%xmm4, %xmm3
	movaps		%xmm8, %xmm5
	movaps		%xmm9, %xmm7
	punpckldq	%xmm2, %xmm0
	punpckldq	%xmm6, %xmm4
	punpckhdq	%xmm2, %xmm1
	punpckhdq	%xmm6, %xmm3
	punpckldq	%xmm10, %xmm8
	punpckldq	%xmm11, %xmm9
	punpckhdq	%xmm10, %xmm5
	punpckhdq	%xmm11, %xmm7
	movaps		%xmm0, %xmm2
	movaps		%xmm1, %xmm6
	movaps		%xmm8, %xmm10
	movaps		%xmm5, %xmm11
	movlhps		%xmm4, %xmm0
	movhlps		%xmm2, %xmm4
	movlhps		%xmm3, %xmm1
	movhlps		%xmm6, %xmm3
	movlhps		%xmm9, %xmm8
	movhlps		%xmm10, %xmm9
	movlhps		%xmm7, %xmm5
	movhlps		%xmm11, %xmm7
	paddd		%xmm4, %xmm0
	paddd		%xmm3, %xmm1
	paddd		%xmm9, %xmm8
	paddd		%xmm7, %xmm5
	paddd		%xmm1, %xmm0
	paddd		%xmm5, %xmm8
	psrad		$13, %xmm0 /* scale accumulated products down by 2^13 */
	psrad		$13, %xmm8

	/*
		Interleave L/R dwords, pack to 16-bit with signed saturation and
		store 4 stereo frames (8 shorts).  Keep unsaturated copies in
		xmm2/xmm3 (left) and xmm4/xmm8 (right) for clip counting.
	*/
	movaps		%xmm0, %xmm1
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm8, %xmm4
	punpckldq	%xmm8, %xmm0 /* L0 R0 L1 R1 */
	punpckhdq	%xmm8, %xmm1 /* L2 R2 L3 R3 */
	packssdw	%xmm1, %xmm0
	movups		%xmm0, (SAMPLES)

	/*
		Clip counting: (v > 32767) masks stay as-is; (v > -32769) masks are
		inverted via pxor with all-ones to get (v < -32768).  packssdw turns
		dword masks into word masks, psrlw $15 turns each 0xFFFF into 1,
		and the per-word counts accumulate in XMMREG_CLIP.
	*/
	pcmpgtd		XMMREG_MAX, %xmm2
	pcmpgtd		XMMREG_MIN, %xmm3
	pcmpgtd		XMMREG_MAX, %xmm4
	pcmpgtd		XMMREG_MIN, %xmm8
	packssdw	%xmm4, %xmm2
	packssdw	%xmm8, %xmm3
	pxor		XMMREG_FULL, %xmm3 /* invert: now "below -32768" */
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		%xmm2, XMMREG_CLIP

	/* advance: window +128 shorts, banks +64 shorts, output +8 shorts */
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R
	leaq		16(SAMPLES), SAMPLES

	decl		%ecx
	jnz			1b

	movl		$4, %ecx /* second half: another 4 x 4 stereo frames */

	ALIGN16
1:
	/*
		Second loop: identical structure, but B0L/B0R now walk BACKWARD
		(negative displacements, -128 step below) over the mirrored second
		half of the filter banks, while WINDOW still advances forward.
	*/
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		64(WINDOW), %xmm2
	movups		80(WINDOW), %xmm3
	movups		128(WINDOW), %xmm4
	movups		144(WINDOW), %xmm5
	movups		192(WINDOW), %xmm6
	movups		208(WINDOW), %xmm7
	movaps		%xmm0, %xmm8
	movaps		%xmm1, %xmm9
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	pmaddwd		(B0L), %xmm0
	pmaddwd		16(B0L), %xmm1
	pmaddwd		-32(B0L), %xmm2
	pmaddwd		-16(B0L), %xmm3
	pmaddwd		(B0R), %xmm8
	pmaddwd		16(B0R), %xmm9
	pmaddwd		-32(B0R), %xmm10
	pmaddwd		-16(B0R), %xmm11
	paddd		%xmm1, %xmm0
	paddd		%xmm3, %xmm2
	paddd		%xmm9, %xmm8
	paddd		%xmm11, %xmm10
	movaps		%xmm4, %xmm1
	movaps		%xmm5, %xmm9
	movaps		%xmm6, %xmm3
	movaps		%xmm7, %xmm11
	pmaddwd		-64(B0L), %xmm4
	pmaddwd		-48(B0L), %xmm5
	pmaddwd		-96(B0L), %xmm6
	pmaddwd		-80(B0L), %xmm7
	pmaddwd		-64(B0R), %xmm1
	pmaddwd		-48(B0R), %xmm9
	pmaddwd		-96(B0R), %xmm3
	pmaddwd		-80(B0R), %xmm11
	paddd		%xmm5, %xmm4
	paddd		%xmm7, %xmm6
	paddd		%xmm1, %xmm9
	paddd		%xmm3, %xmm11

	/* Same transpose / horizontal-add as the first loop. */
	movaps		%xmm0, %xmm1
	movaps		%xmm4, %xmm3
	movaps		%xmm8, %xmm5
	movaps		%xmm9, %xmm7
	punpckldq	%xmm2, %xmm0
	punpckldq	%xmm6, %xmm4
	punpckhdq	%xmm2, %xmm1
	punpckhdq	%xmm6, %xmm3
	punpckldq	%xmm10, %xmm8
	punpckldq	%xmm11, %xmm9
	punpckhdq	%xmm10, %xmm5
	punpckhdq	%xmm11, %xmm7
	movaps		%xmm0, %xmm2
	movaps		%xmm1, %xmm6
	movaps		%xmm8, %xmm10
	movaps		%xmm5, %xmm11
	movlhps		%xmm4, %xmm0
	movhlps		%xmm2, %xmm4
	movlhps		%xmm3, %xmm1
	movhlps		%xmm6, %xmm3
	movlhps		%xmm9, %xmm8
	movhlps		%xmm10, %xmm9
	movlhps		%xmm7, %xmm5
	movhlps		%xmm11, %xmm7
	paddd		%xmm4, %xmm0
	paddd		%xmm3, %xmm1
	paddd		%xmm9, %xmm8
	paddd		%xmm7, %xmm5
	paddd		%xmm1, %xmm0
	paddd		%xmm5, %xmm8
	psrad		$13, %xmm0
	psrad		$13, %xmm8

	/* Interleave, saturate, store — as in the first loop. */
	movaps		%xmm0, %xmm1
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm8, %xmm4
	punpckldq	%xmm8, %xmm0
	punpckhdq	%xmm8, %xmm1
	packssdw	%xmm1, %xmm0
	movups		%xmm0, (SAMPLES)

	/* Clip counting — as in the first loop. */
	pcmpgtd		XMMREG_MAX, %xmm2
	pcmpgtd		XMMREG_MIN, %xmm3
	pcmpgtd		XMMREG_MAX, %xmm4
	pcmpgtd		XMMREG_MIN, %xmm8
	packssdw	%xmm4, %xmm2
	packssdw	%xmm8, %xmm3
	pxor		XMMREG_FULL, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		%xmm2, XMMREG_CLIP

	/* window forward, banks backward, output forward */
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R
	leaq		16(SAMPLES), SAMPLES

	decl		%ecx
	jnz			1b

	/*
		Horizontal sum of the 8 word counters in XMMREG_CLIP:
		fold high qword onto low (movhlps+paddw), then add words 1..3
		of the low qword (extracted with pshuflw) onto word 0.
	*/
	movhlps		XMMREG_CLIP, %xmm0
	paddw		XMMREG_CLIP, %xmm0
	pshuflw		$0x55, %xmm0, %xmm1
	pshuflw		$0xaa, %xmm0, %xmm2
	pshuflw		$0xff, %xmm0, %xmm3
	paddw		%xmm1, %xmm0
	paddw		%xmm2, %xmm0
	paddw		%xmm3, %xmm0

	movd		%xmm0, %eax
	andl		$0xffff, %eax /* return only the low-word total */

#ifdef IS_MSABI
	/* Restore callee-saved xmm6-15 (Win64 ABI) and the stack. */
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	movaps		144(%rsp), %xmm15
	addq		$168, %rsp
#endif
	ret
334
335NONEXEC_STACK
336