/*
	synth_stereo_x86_64_accurate: SSE optimized synth for x86-64 (stereo specific, MPEG-compliant 16bit output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases for the pointer arguments as they are used inside the
	loops of synth_1to1_s_x86_64_accurate_asm. The entry code moves two of the
	incoming arguments so that %rcx is free to serve as the loop counter.
*/
#ifdef IS_MSABI
/* real *window;   arrives in %rcx (1st argument), copied to %rsi on entry */
#define WINDOW %rsi
/* real *b0l;      2nd argument, used in place */
#define B0L %rdx
/* real *b0r;      3rd argument, used in place */
#define B0R %r8
/* short *samples; 4th argument, used in place */
#define SAMPLES %r9
#else
/* real *window;   1st argument, used in place */
#define WINDOW %rdi
/* real *b0l;      2nd argument, used in place */
#define B0L %rsi
/* real *b0r;      3rd argument, used in place */
#define B0R %rdx
/* short *samples; arrives in %rcx (4th argument), moved to %r8 once bo1 has been read from %r8 */
#define SAMPLES %r8
#endif

/* Memory operands for the clipping limits; %r10/%r11 are pointed at maxmin_s16 on entry. */
#define XMMREG_MAX (%r10)  /* {32767.0, 32767.0, 32767.0, 32767.0} */
#define XMMREG_MIN (%r11)  /* {-32768.0, -32768.0, -32768.0, -32768.0} */
/* 16 bytes at the bottom of the stack frame: eight 16-bit clip counters. */
#define TEMP_CLIP (%rsp)

/*
	int synth_1to1_s_x86_64_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
	return value: number of clipped samples
*/

/*
	Clipping limits for signed 16-bit output, stored as raw IEEE-754
	single-precision bit patterns: four copies of the upper limit followed by
	four copies of the lower limit. The table is 32-byte aligned, so both
	16-byte halves can be used directly as SSE memory operands.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
ASM_NAME(maxmin_s16):
	/* 0x46FFFE00 = 32767.0f (upper limit, read through XMMREG_MAX) */
	.long   1191181824
	.long   1191181824
	.long   1191181824
	.long   1191181824
	/* 0xC7000000 = -32768.0f (lower limit, read through XMMREG_MIN at offset 16) */
	.long   -956301312
	.long   -956301312
	.long   -956301312
	.long   -956301312
55	.text
56	ALIGN16
57.globl ASM_NAME(synth_1to1_s_x86_64_accurate_asm)
58ASM_NAME(synth_1to1_s_x86_64_accurate_asm):
59#ifdef IS_MSABI /* should save xmm6-15 */
60	movl		40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
61	pushq		%rsi
62	subq		$176, %rsp /* 10 xmm registers + temp */
63	movaps		%xmm6, 16(%rsp)
64	movaps		%xmm7, 32(%rsp)
65	movaps		%xmm8, 48(%rsp)
66	movaps		%xmm9, 64(%rsp)
67	movaps		%xmm10, 80(%rsp)
68	movaps		%xmm11, 96(%rsp)
69	movaps		%xmm12, 112(%rsp)
70	movaps		%xmm13, 128(%rsp)
71	movaps		%xmm14, 144(%rsp)
72	movaps		%xmm15, 160(%rsp)
73#else
74	subq		$24, %rsp /* stack alignment + temp */
75#endif
76
77	leaq		ASM_NAME(maxmin_s16)(%rip), %r10
78	leaq		16(%r10), %r11
79	xorps		%xmm0, %xmm0
80	movaps		%xmm0, TEMP_CLIP
81
82#ifdef IS_MSABI
83	shlq		$32, %rax
84	shrq		$30, %rax
85	movq		%rcx, %rsi
86#else
87	movq		%r8, %rax
88	shlq		$32, %rax
89	shrq		$30, %rax
90	movq		%rcx, %r8
91#endif
92	leaq		64(WINDOW), WINDOW
93	subq		%rax, WINDOW
94
95	movl		$4, %ecx
96
97	ALIGN16
981:
99	movups		(WINDOW), %xmm8
100	movups		16(WINDOW), %xmm1
101	movups		32(WINDOW), %xmm2
102	movups		48(WINDOW), %xmm3
103	movups		128(WINDOW), %xmm9
104	movups		144(WINDOW), %xmm5
105	movups		160(WINDOW), %xmm6
106	movups		176(WINDOW), %xmm7
107	movaps		%xmm8, %xmm0
108	movaps		%xmm1, %xmm4
109	movaps		%xmm2, %xmm10
110	movaps		%xmm3, %xmm11
111	movaps		%xmm9, %xmm12
112	movaps		%xmm5, %xmm13
113	movaps		%xmm6, %xmm14
114	movaps		%xmm7, %xmm15
115	mulps		(B0L), %xmm8
116	mulps		16(B0L), %xmm1
117	mulps		32(B0L), %xmm2
118	mulps		48(B0L), %xmm3
119	mulps		64(B0L), %xmm9
120	mulps		80(B0L), %xmm5
121	mulps		96(B0L), %xmm6
122	mulps		112(B0L), %xmm7
123	mulps		(B0R), %xmm0
124	mulps		16(B0R), %xmm4
125	mulps		32(B0R), %xmm10
126	mulps		48(B0R), %xmm11
127	mulps		64(B0R), %xmm12
128	mulps		80(B0R), %xmm13
129	mulps		96(B0R), %xmm14
130	mulps		112(B0R), %xmm15
131
132	addps		%xmm1, %xmm8
133	addps		%xmm2, %xmm3
134	addps		%xmm4, %xmm0
135	addps		%xmm11, %xmm10
136	addps		%xmm5, %xmm9
137	addps		%xmm7, %xmm6
138	addps		%xmm13, %xmm12
139	addps		%xmm15, %xmm14
140	addps		%xmm3, %xmm8
141	addps		%xmm6, %xmm9
142	addps		%xmm10, %xmm0
143	addps		%xmm12, %xmm14
144	movaps		%xmm0, %xmm12
145	movaps		%xmm14, %xmm13
146	leaq		256(WINDOW), WINDOW
147	leaq		128(B0L), B0L
148	leaq		128(B0R), B0R
149
150	movups		(WINDOW), %xmm10
151	movups		16(WINDOW), %xmm1
152	movups		32(WINDOW), %xmm2
153	movups		48(WINDOW), %xmm3
154	movups		128(WINDOW), %xmm11
155	movups		144(WINDOW), %xmm5
156	movups		160(WINDOW), %xmm6
157	movups		176(WINDOW), %xmm7
158	movaps		%xmm10, %xmm0
159	movaps		%xmm1, %xmm4
160	movaps		%xmm2, %xmm14
161	movaps		%xmm3, %xmm15
162	mulps		(B0L), %xmm10
163	mulps		16(B0L), %xmm1
164	mulps		32(B0L), %xmm2
165	mulps		48(B0L), %xmm3
166	mulps		(B0R), %xmm0
167	mulps		16(B0R), %xmm4
168	mulps		32(B0R), %xmm14
169	mulps		48(B0R), %xmm15
170	addps		%xmm1, %xmm10
171	addps		%xmm2, %xmm3
172	addps		%xmm4, %xmm0
173	addps		%xmm15, %xmm14
174	movaps		%xmm11, %xmm1
175	movaps		%xmm5, %xmm2
176	movaps		%xmm6, %xmm4
177	movaps		%xmm7, %xmm15
178	mulps		64(B0L), %xmm11
179	mulps		80(B0L), %xmm5
180	mulps		96(B0L), %xmm6
181	mulps		112(B0L), %xmm7
182	mulps		64(B0R), %xmm1
183	mulps		80(B0R), %xmm2
184	mulps		96(B0R), %xmm4
185	mulps		112(B0R), %xmm15
186	addps		%xmm5, %xmm11
187	addps		%xmm7, %xmm6
188	addps		%xmm2, %xmm1
189	addps		%xmm15, %xmm4
190
191	addps		%xmm3, %xmm10
192	addps		%xmm6, %xmm11
193	addps		%xmm0, %xmm14
194	addps		%xmm4, %xmm1
195	movaps		%xmm1, %xmm15
196	leaq		256(WINDOW), WINDOW
197	leaq		128(B0L), B0L
198	leaq		128(B0R), B0R
199
200	movaps		%xmm8, %xmm0
201	movaps		%xmm10, %xmm1
202	movaps		%xmm12, %xmm4
203	movaps		%xmm14, %xmm5
204	unpcklps	%xmm9, %xmm8
205	unpcklps	%xmm11, %xmm10
206	unpckhps	%xmm9, %xmm0
207	unpckhps	%xmm11, %xmm1
208	unpcklps	%xmm13, %xmm12
209	unpcklps	%xmm15, %xmm14
210	unpckhps	%xmm13, %xmm4
211	unpckhps	%xmm15, %xmm5
212	movaps		%xmm8, %xmm2
213	movaps		%xmm0, %xmm3
214	movaps		%xmm12, %xmm6
215	movaps		%xmm4, %xmm7
216	movlhps		%xmm10, %xmm8
217	movhlps		%xmm2, %xmm10
218	movlhps		%xmm1, %xmm0
219	movhlps		%xmm3, %xmm1
220	movlhps		%xmm14, %xmm12
221	movhlps		%xmm6, %xmm14
222	movlhps		%xmm5, %xmm4
223	movhlps		%xmm7, %xmm5
224	subps		%xmm10, %xmm8
225	subps		%xmm1, %xmm0
226	subps		%xmm14, %xmm12
227	subps		%xmm5, %xmm4
228	addps		%xmm8, %xmm0
229	addps		%xmm12, %xmm4
230
231	movaps		%xmm0, %xmm2
232	movaps		%xmm0, %xmm3
233	movaps		%xmm4, %xmm5
234	movaps		%xmm4, %xmm6
235	cmpnleps	XMMREG_MAX, %xmm2
236	cmpltps		XMMREG_MIN, %xmm3
237	cmpnleps	XMMREG_MAX, %xmm5
238	cmpltps		XMMREG_MIN, %xmm6
239	cvtps2dq	%xmm0, %xmm0
240	cvtps2dq	%xmm4, %xmm4
241	movaps		%xmm0, %xmm1
242	unpcklps	%xmm4, %xmm0
243	unpckhps	%xmm4, %xmm1
244	packssdw	%xmm1, %xmm0
245	movups		%xmm0, (SAMPLES)
246
247	packssdw	%xmm5, %xmm2
248	packssdw	%xmm6, %xmm3
249	psrlw		$15, %xmm2
250	psrlw		$15, %xmm3
251	paddw		%xmm3, %xmm2
252	paddw		TEMP_CLIP, %xmm2
253	movaps		%xmm2, TEMP_CLIP
254
255	leaq		16(SAMPLES), SAMPLES
256	decl		%ecx
257	jnz			1b
258
259	movl		$4, %ecx
260
261	ALIGN16
2621:
263	movups		(WINDOW), %xmm8
264	movups		16(WINDOW), %xmm1
265	movups		32(WINDOW), %xmm2
266	movups		48(WINDOW), %xmm3
267	movups		128(WINDOW), %xmm9
268	movups		144(WINDOW), %xmm5
269	movups		160(WINDOW), %xmm6
270	movups		176(WINDOW), %xmm7
271	movaps		%xmm8, %xmm0
272	movaps		%xmm1, %xmm4
273	movaps		%xmm2, %xmm10
274	movaps		%xmm3, %xmm11
275	movaps		%xmm9, %xmm12
276	movaps		%xmm5, %xmm13
277	movaps		%xmm6, %xmm14
278	movaps		%xmm7, %xmm15
279	mulps		(B0L), %xmm8
280	mulps		16(B0L), %xmm1
281	mulps		32(B0L), %xmm2
282	mulps		48(B0L), %xmm3
283	mulps		-64(B0L), %xmm9
284	mulps		-48(B0L), %xmm5
285	mulps		-32(B0L), %xmm6
286	mulps		-16(B0L), %xmm7
287	mulps		(B0R), %xmm0
288	mulps		16(B0R), %xmm4
289	mulps		32(B0R), %xmm10
290	mulps		48(B0R), %xmm11
291	mulps		-64(B0R), %xmm12
292	mulps		-48(B0R), %xmm13
293	mulps		-32(B0R), %xmm14
294	mulps		-16(B0R), %xmm15
295
296	addps		%xmm1, %xmm8
297	addps		%xmm2, %xmm3
298	addps		%xmm4, %xmm0
299	addps		%xmm11, %xmm10
300	addps		%xmm5, %xmm9
301	addps		%xmm7, %xmm6
302	addps		%xmm13, %xmm12
303	addps		%xmm15, %xmm14
304	addps		%xmm3, %xmm8
305	addps		%xmm6, %xmm9
306	addps		%xmm10, %xmm0
307	addps		%xmm12, %xmm14
308	movaps		%xmm0, %xmm12
309	movaps		%xmm14, %xmm13
310	leaq		256(WINDOW), WINDOW
311	leaq		-128(B0L), B0L
312	leaq		-128(B0R), B0R
313
314	movups		(WINDOW), %xmm10
315	movups		16(WINDOW), %xmm1
316	movups		32(WINDOW), %xmm2
317	movups		48(WINDOW), %xmm3
318	movups		128(WINDOW), %xmm11
319	movups		144(WINDOW), %xmm5
320	movups		160(WINDOW), %xmm6
321	movups		176(WINDOW), %xmm7
322	movaps		%xmm10, %xmm0
323	movaps		%xmm1, %xmm4
324	movaps		%xmm2, %xmm14
325	movaps		%xmm3, %xmm15
326	mulps		(B0L), %xmm10
327	mulps		16(B0L), %xmm1
328	mulps		32(B0L), %xmm2
329	mulps		48(B0L), %xmm3
330	mulps		(B0R), %xmm0
331	mulps		16(B0R), %xmm4
332	mulps		32(B0R), %xmm14
333	mulps		48(B0R), %xmm15
334	addps		%xmm1, %xmm10
335	addps		%xmm2, %xmm3
336	addps		%xmm4, %xmm0
337	addps		%xmm15, %xmm14
338	movaps		%xmm11, %xmm1
339	movaps		%xmm5, %xmm2
340	movaps		%xmm6, %xmm4
341	movaps		%xmm7, %xmm15
342	mulps		-64(B0L), %xmm11
343	mulps		-48(B0L), %xmm5
344	mulps		-32(B0L), %xmm6
345	mulps		-16(B0L), %xmm7
346	mulps		-64(B0R), %xmm1
347	mulps		-48(B0R), %xmm2
348	mulps		-32(B0R), %xmm4
349	mulps		-16(B0R), %xmm15
350	addps		%xmm5, %xmm11
351	addps		%xmm7, %xmm6
352	addps		%xmm2, %xmm1
353	addps		%xmm15, %xmm4
354
355	addps		%xmm3, %xmm10
356	addps		%xmm6, %xmm11
357	addps		%xmm0, %xmm14
358	addps		%xmm4, %xmm1
359	movaps		%xmm1, %xmm15
360	leaq		256(WINDOW), WINDOW
361	leaq		-128(B0L), B0L
362	leaq		-128(B0R), B0R
363
364	movaps		%xmm8, %xmm0
365	movaps		%xmm10, %xmm1
366	movaps		%xmm12, %xmm4
367	movaps		%xmm14, %xmm5
368	unpcklps	%xmm9, %xmm8
369	unpcklps	%xmm11, %xmm10
370	unpckhps	%xmm9, %xmm0
371	unpckhps	%xmm11, %xmm1
372	unpcklps	%xmm13, %xmm12
373	unpcklps	%xmm15, %xmm14
374	unpckhps	%xmm13, %xmm4
375	unpckhps	%xmm15, %xmm5
376	movaps		%xmm8, %xmm2
377	movaps		%xmm0, %xmm3
378	movaps		%xmm12, %xmm6
379	movaps		%xmm4, %xmm7
380	movlhps		%xmm10, %xmm8
381	movhlps		%xmm2, %xmm10
382	movlhps		%xmm1, %xmm0
383	movhlps		%xmm3, %xmm1
384	movlhps		%xmm14, %xmm12
385	movhlps		%xmm6, %xmm14
386	movlhps		%xmm5, %xmm4
387	movhlps		%xmm7, %xmm5
388	addps		%xmm10, %xmm8
389	addps		%xmm1, %xmm0
390	addps		%xmm14, %xmm12
391	addps		%xmm5, %xmm4
392	addps		%xmm8, %xmm0
393	addps		%xmm12, %xmm4
394
395	movaps		%xmm0, %xmm2
396	movaps		%xmm0, %xmm3
397	movaps		%xmm4, %xmm5
398	movaps		%xmm4, %xmm6
399	cmpnleps	XMMREG_MAX, %xmm2
400	cmpltps		XMMREG_MIN, %xmm3
401	cmpnleps	XMMREG_MAX, %xmm5
402	cmpltps		XMMREG_MIN, %xmm6
403	cvtps2dq	%xmm0, %xmm0
404	cvtps2dq	%xmm4, %xmm4
405	movaps		%xmm0, %xmm1
406	unpcklps	%xmm4, %xmm0
407	unpckhps	%xmm4, %xmm1
408	packssdw	%xmm1, %xmm0
409	movups		%xmm0, (SAMPLES)
410
411	packssdw	%xmm5, %xmm2
412	packssdw	%xmm6, %xmm3
413	psrlw		$15, %xmm2
414	psrlw		$15, %xmm3
415	paddw		%xmm3, %xmm2
416	paddw		TEMP_CLIP, %xmm2
417	movaps		%xmm2, TEMP_CLIP
418
419	leaq		16(SAMPLES), SAMPLES
420	decl		%ecx
421	jnz			1b
422
423	movaps		TEMP_CLIP, %xmm4
424	movhlps		%xmm4, %xmm0
425	paddw		%xmm4, %xmm0
426	pshuflw		$0x55, %xmm0, %xmm1
427	pshuflw		$0xaa, %xmm0, %xmm2
428	pshuflw		$0xff, %xmm0, %xmm3
429	paddw		%xmm1, %xmm0
430	paddw		%xmm2, %xmm0
431	paddw		%xmm3, %xmm0
432
433	movd		%xmm0, %eax
434	andl		$0xffff, %eax
435
436#ifdef IS_MSABI
437	movaps		16(%rsp), %xmm6
438	movaps		32(%rsp), %xmm7
439	movaps		48(%rsp), %xmm8
440	movaps		64(%rsp), %xmm9
441	movaps		80(%rsp), %xmm10
442	movaps		96(%rsp), %xmm11
443	movaps		112(%rsp), %xmm12
444	movaps		128(%rsp), %xmm13
445	movaps		144(%rsp), %xmm14
446	movaps		160(%rsp), %xmm15
447	addq		$176, %rsp
448	popq		%rsi
449#else
450	addq		$24, %rsp
451#endif
452	ret
453
/*
	NONEXEC_STACK is a macro supplied by mangle.h (not visible in this file);
	judging by its name it presumably emits the section note that marks the
	stack as non-executable -- confirm against mangle.h.
*/
NONEXEC_STACK