/*
	dct36_sse: SSE optimized dct36

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/
8
#include "mangle.h"

/*
	Symbolic names for the general-purpose registers used by dct36_sse.

	in and out1 deliberately share %edi: the input pointer is only needed
	for the initial loads, and out1 is loaded into %edi after the last read
	through in.
	ts is %eax and out2 is %edx; in the first half of the function those two
	registers hold the addresses of the constant tables instead.
	tmp points at the 16-byte aligned scratch area on the stack.
*/
#define in %edi
#define out1 %edi
#define out2 %edx
#define w  %ecx
#define ts %eax
#define tmp %esi
17
/*
	void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/
21
/*
	Constant tables for dct36_sse.

	Every .long below is the raw bit pattern of an IEEE-754 single-precision
	value. The tables are stored in the lane order in which the code loads
	them as 16-byte vectors, so the element order must not be changed.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/*
	Four coefficient vectors, read at 0, 16, 32 and 48(%eax).
	Each value is stored twice so that one vector multiplies an
	(a, b, a, b)-interleaved pair of lanes.
*/
dct36_sse_COS9:
	.long 0x3f5db3d7	/* ~0.8660 */
	.long 0x3f5db3d7
	.long 0x3f000000	/*  0.5    */
	.long 0x3f000000
	.long 0x3f7c1c5c	/* ~0.9848 */
	.long 0x3f7c1c5c
	.long 0x3f708fb2	/* ~0.9397 */
	.long 0x3f708fb2
	.long 0x3f248dbb	/* ~0.6428 */
	.long 0x3f248dbb
	.long 0x3e31d0d4	/* ~0.1736 */
	.long 0x3e31d0d4
	.long 0x3eaf1d44	/* ~0.3420 */
	.long 0x3eaf1d44
	.long 0x3f441b7d	/* ~0.7660 */
	.long 0x3f441b7d
	ALIGN16
/*
	Nine scale factors: two vectors read at 0 and 16(%edx), plus a single
	scalar read with movss at 32(%edx).
*/
dct36_sse_tfcos36:
	.long 0x3f007d2b	/* ~0.5019 */
	.long 0x3f0483ee	/* ~0.5176 */
	.long 0x3f0d3b7d	/* ~0.5517 */
	.long 0x3f1c4257	/* ~0.6104 */
	.long 0x40b79454	/* ~5.7369 */
	.long 0x3ff746ea	/* ~1.9319 */
	.long 0x3f976fd9	/* ~1.1831 */
	.long 0x3f5f2944	/* ~0.8717 */
	.long 0x3f3504f3	/* ~0.7071 */
	ALIGN16
/* AND mask that keeps lanes 1 and 3 and clears lanes 0 and 2. */
dct36_sse_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
/* XOR mask that flips the sign bit of all four lanes. */
dct36_sse_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
/*
	void dct36_sse(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf)

	32-bit x86, AT&T syntax, arguments passed on the stack:
		 8(%ebp) inbuf   18 floats, read (not written) with unaligned loads
		12(%ebp) o1      18 floats, read
		16(%ebp) o2      18 floats, written
		20(%ebp) wintab  36 floats, read
		24(%ebp) tsbuf   18 floats written at a stride of 32 floats

	Preserves %ebx, %esi, %edi and %ebp.
	Clobbers %eax, %ecx, %edx, %xmm0-%xmm7 and the flags.
	Uses an 80-byte, 16-byte aligned scratch area on its own stack frame, so
	the caller's stack alignment does not matter.
	NOTE(review): the loads at (w) and 128(w) go through the MOVUAPS macro
	from mangle.h; whether wintab must be 16-byte aligned depends on how that
	macro is defined - confirm there.
*/
	.text
	ALIGN16
	.globl ASM_NAME(dct36_sse)
ASM_NAME(dct36_sse):
	push		%ebp
	mov			%esp, %ebp
	/* Align the frame and reserve 5 x 16 bytes of scratch space. */
	and			$-16, %esp
	sub			$80, %esp
	push		%ebx
	push		%esi
	push		%edi
	/* Skip the three saved registers: tmp is the 16-byte aligned scratch base. */
	lea			12(%esp), tmp
	movl		8(%ebp), in

	/* Macro from mangle.h; presumably sets up %ebx for LOCAL_VAR in PIC builds. */
	GET_GOT

	lea			LOCAL_VAR(dct36_sse_COS9), %eax
	lea			LOCAL_VAR(dct36_sse_tfcos36), %edx

	/*
		Load in[0..17] (xmm1..xmm5, upper half of xmm5 zero) and compute
		in[i] += in[i-1] for i = 17 down to 1. Each step rotates the next
		lower vector by one lane and patches lane 0 with movss.
	*/
	xorps		%xmm0, %xmm0
	xorps		%xmm5, %xmm5
	movlps		64(in), %xmm5
	movups		48(in), %xmm4
	movups		32(in), %xmm3
	movups		16(in), %xmm2
	movups		(in), %xmm1
	movaps		%xmm5, %xmm6
	shufps		$0xe1, %xmm6, %xmm6
	movaps		%xmm4, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm5
	movaps		%xmm3, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm4
	movaps		%xmm2, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm3
	movaps		%xmm1, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm2
	/* xmm0 is zero here, so in[0] gets nothing added. */
	movss		%xmm0, %xmm6
	addps		%xmm6, %xmm1

	/*
		Shift every vector down by two elements and add the odd-indexed
		element two positions below (selected by the mask) to each
		odd-indexed element: in[i] += in[i-2] for odd i = 17 down to 3.
	*/
	movaps		LOCAL_VAR(dct36_sse_mask), %xmm0
	movaps		%xmm4, %xmm6
	shufps		$0x4e, %xmm5, %xmm4
	movaps		%xmm3, %xmm7
	shufps		$0x4e, %xmm6, %xmm3
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm4
	movaps		%xmm2, %xmm6
	shufps		$0x4e, %xmm7, %xmm2
	andps		%xmm0, %xmm7
	addps		%xmm7, %xmm3
	movaps		%xmm1, %xmm7
	shufps		$0x4e, %xmm6, %xmm1
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm2
	movaps		%xmm7, %xmm6
	andps		%xmm0, %xmm7
	xorps		%xmm0, %xmm0
	addps		%xmm7, %xmm1
	movlhps		%xmm6, %xmm0

/*
xmm0 in[-,-,0,1]   (the two "-" lanes are zero)
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/

	/* Exchange the upper halves of xmm2/xmm3/xmm4 cyclically. */
	movaps		%xmm2, %xmm5
	shufps		$0xe4, %xmm3, %xmm5
	shufps		$0xe4, %xmm4, %xmm3
	shufps		$0xe4, %xmm2, %xmm4
	movaps		%xmm5, %xmm2

/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/

	mulps		(%eax), %xmm5
	addps		%xmm0, %xmm5

	/* Spill xmm0 and xmm2; both are reloaded further down. */
	movaps		%xmm0, (tmp)
	movaps		%xmm2, 16(tmp)

/*
0(tmp) in[-,-,0,1]
xmm5 [ta33,tb33,ta66,tb66]
*/

	/* Row "-1": (xmm1 - xmm3 - xmm4) * COS9 vector 0 + (xmm0 - [0,0,in12,in13]). */
	movaps		%xmm1, %xmm6
	subps		%xmm3, %xmm6
	subps		%xmm4, %xmm6
	xorps		%xmm7, %xmm7
	shufps		$0xe0, %xmm2, %xmm7
	mulps		(%eax), %xmm6
	subps		%xmm7, %xmm0
	addps		%xmm0, %xmm6
	movaps		%xmm6, 48(tmp)

	/* COS9 vector 1 stays in xmm2 for the next three rows. */
	movaps		16(%eax), %xmm2

	/* Row "-0" -> 32(tmp). */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		%xmm2, %xmm0
	mulps		32(%eax), %xmm6
	mulps		48(%eax), %xmm7
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm6
	addps		%xmm6, %xmm0
	movaps		%xmm0, 32(tmp)

	/* Row "-2" -> 64(tmp). */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		32(%eax), %xmm0
	mulps		48(%eax), %xmm6
	mulps		%xmm2, %xmm7
	subps		%xmm5, %xmm0
	subps		%xmm6, %xmm7
	addps		%xmm7, %xmm0
	movaps		%xmm0, 64(tmp)

	/* Row "-3" stays in xmm6; xmm2 (COS9 vector 1) is consumed here. */
	movaps		%xmm1, %xmm6
	movaps		%xmm4, %xmm7
	mulps		48(%eax), %xmm6
	mulps		%xmm3, %xmm2
	mulps		32(%eax), %xmm7
	subps		%xmm5, %xmm6
	subps		%xmm7, %xmm2
	addps		%xmm2, %xmm6

	/*
		Middle pair: combine the spilled values with xmm1/xmm3/xmm4, move
		lane 3 to lane 0 and lane 2 to lane 2, and scale only lane 0 by the
		ninth tfcos36 factor. The result replaces the spill at 0(tmp).
	*/
	movaps		(tmp), %xmm0
	movss		32(%edx), %xmm5
	subps		%xmm1, %xmm0
	subps		16(tmp), %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		$0xaf, %xmm0, %xmm0
	mulss		%xmm5, %xmm0
	movaps		%xmm0, (tmp)

	movaps		32(tmp), %xmm0
	movaps		48(tmp), %xmm1
	movaps		64(tmp), %xmm2

/*
xmm0 [1a-0,1b-0, 2a-0, 2b-0]
xmm1 [1a-1,1b-1, 2a-1, 2b-1]
xmm2 [1a-2,1b-2,-2a-2,-2b-2]
xmm6 [1a-3,1b-3,-2a-3,-2b-3]
*/

	/* Interleave the rows pairwise; the sign mask undoes the negated "2" halves. */
	movaps		%xmm0, %xmm3
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm3
	movaps		%xmm2, %xmm5
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm5
	xorps		LOCAL_VAR(dct36_sse_sign), %xmm5

/*
xmm0 [1a-0,1a-1,1b-0,1b-1]
xmm3 [2a-0,2a-1,2b-0,2b-1]
xmm2 [1a-2,1a-3,1b-2,1b-3]
xmm5 [2a-2,2a-3,2b-2,2b-3]
*/

	/* Second half of the 4x4 transpose: gather the 64-bit halves. */
	movaps		%xmm0, %xmm1
	movlhps		%xmm2, %xmm0
	movhlps		%xmm1, %xmm2
	movaps		%xmm3, %xmm4
	movlhps		%xmm5, %xmm3
	movhlps		%xmm4, %xmm5

/*
xmm0 tmp1a
xmm3 tmp2a
xmm2 tmp1b
xmm5 tmp2b
*/

	/* Sum/difference butterflies; the "b" results are scaled by tfcos36. */
	movaps		(%edx), %xmm6
	movaps		16(%edx), %xmm7
	movaps		%xmm5, %xmm1
	addps		%xmm2, %xmm5
	subps		%xmm2, %xmm1
	movaps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	subps		%xmm0, %xmm2
	mulps		%xmm6, %xmm5
	mulps		%xmm1, %xmm7

	movaps		%xmm2, 16(tmp)

/*
%xmm3 tmp[0,1,2,3]
%xmm5 tmp[17,16,15,14]
16(tmp) tmp[8,7,6,5]
%xmm7 tmp[9,10,11,12]
0(tmp) tmp[13,-,4,-]
*/

	/*
		The table pointers and the input pointer are dead from here on:
		%edi, %edx and %eax are reused for out1, out2 and ts.
	*/
	movl		12(%ebp), out1
	movl		16(%ebp), out2
	movl		20(%ebp), w
	movl		24(%ebp), ts

	/*
		Output group 1, from tmp[0..3] and tmp[17..14]:
		  out2[9..12], out2[5..8]              (window * sum)
		  ts[32*9 .. 32*12], ts[32*5 .. 32*8]  (window * difference + out1)
		ts displacements are in bytes: 32*4*k is float index 32*k.
	*/
	movaps		%xmm3, %xmm0
	movaps		%xmm5, %xmm1
	movups		108(w), %xmm2
	movups		92(w), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	movups		36(w), %xmm4
	movups		20(w), %xmm5
	shufps		$0x1b, %xmm5, %xmm5
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(out1), %xmm1
	movups		20(out1), %xmm3
	shufps		$0x1b, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		$0x1b, %xmm0, %xmm0
	movups		%xmm2, 36(out2)
	movups		%xmm0, 20(out2)
	/* Scatter the four lanes of xmm1 and xmm3 to the strided ts slots. */
	movss		%xmm1, 32*36(ts)
	movss		%xmm3, 32*20(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(ts)
	movss		%xmm4, 32*28(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*40(ts)
	movss		%xmm3, 32*24(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(ts)
	movss		%xmm4, 32*32(ts)

	/*
		Output group 2 (scalar), from tmp[4] at 8(tmp) and tmp[13] at 0(tmp):
		  out2[13], out2[4], ts[32*13], ts[32*4]
	*/
	movss		8(tmp), %xmm0
	movss		(tmp), %xmm1
	movss		124(w), %xmm2
	movss		88(w), %xmm3
	movss		52(w), %xmm4
	movss		16(w), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm1, %xmm0
	subss		%xmm1, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(out1), %xmm4
	addss		16(out1), %xmm6
	movss		%xmm2, 52(out2)
	movss		%xmm0, 16(out2)
	movss		%xmm4, 32*52(ts)
	movss		%xmm6, 32*16(ts)

	/*
		Output group 3, from tmp[8..5] at 16(tmp) and tmp[9..12] in xmm7:
		  out2[0..3], out2[14..17]
		  ts[0 .. 32*3], ts[32*14 .. 32*17]
		Offsets 56/64 are accessed as two 8-byte halves (movlps/movhps).
	*/
	movaps		16(tmp), %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(w), %xmm2
	movups		72(w), %xmm3
	shufps		$0x1b, %xmm2, %xmm2
	movlps		56(w), %xmm4
	movhps		64(w), %xmm4
	MOVUAPS		(w), %xmm5
	shufps		$0x1b, %xmm4, %xmm4
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(out1), %xmm1
	movhps		64(out1), %xmm1
	movups		(out1), %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		$0x1b, %xmm2, %xmm2
	movups		%xmm0, (out2)
	movlps		%xmm2, 56(out2)
	movhps		%xmm2, 64(out2)
	/* Scatter the four lanes of xmm1 and xmm3 to the strided ts slots. */
	movss		%xmm1, 32*56(ts)
	movss		%xmm3, (ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(ts)
	movss		%xmm4, 32*8(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*60(ts)
	movss		%xmm3, 32*4(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(ts)
	movss		%xmm4, 32*12(ts)

	/* %esp is back where it was after the three pushes; restore and unwind. */
	pop			%edi
	pop			%esi
	pop			%ebx
	mov			%ebp, %esp
	pop			%ebp

	ret

/* Macro from mangle.h; presumably marks the object as not needing an executable stack. */
NONEXEC_STACK
388