1 /*
2 	decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
3 
4 	copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
5 	see COPYING and AUTHORS files in distribution or http://mpg123.org
6 	initially written by the mysterious higway for MMX (apparently)
7 	then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
8 	Both have agreed to distribution under LGPL 2.1 .
9 
10 	Transformed back into standalone asm, with help of
11 	gcc -S -DHAVE_CONFIG_H -I.  -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
12 
13 	The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
14 	This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
15 	That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
16 	Maybe I'll change it in future, but now I need something that works.
17 
18 	Original comment from MPlayer source follows. Regarding the license history see
19 	synth_mmx.S, which the original comment about this being licensed under GPL is
20 	relating to.
21 */
22 
23 /*
24  * This code was taken from http://www.mpg123.org
25  * See ChangeLog of mpg123-0.59s-pre.1 for detail
26  * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
27  *
28  * Local ChangeLog:
29  * - Partial loops unrolling and removing MOVW insn from loops
30 */
31 
32 #include "mangle.h"
33 
34 	.data
35 	ALIGN8
36 one_null:
37 	.long	-65536
38 	.long	-65536
39 	ALIGN8
40 null_one:
41 	.long	65535
42 	.long	65535
43 
44 	.text
45 	ALIGN16
46 	/* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
47 .globl SYNTH_NAME
48 SYNTH_NAME:
49 	pushl	%ebp
50 /* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
51 	movl	%esp, %ebp
52 
53 /* Now the old stack addresses are preserved via %epb. */
54 #ifdef PIC
55 	subl  $8,%esp /* What has been called temp before. */
56 #else
57 	subl  $4,%esp /* What has been called temp before. */
58 #endif
59 	pushl	%edi
60 	pushl	%esi
61 	pushl	%ebx
62 
63 #ifdef PIC
64 	#undef _EBX_
65 	#define _EBX_ %eax
66 	GET_GOT
67 #define EBXSAVE -4(%ebp)
68 	movl _EBX_, EBXSAVE /* save PIC register */
69 #endif
70 
71 #define TEMP 12(%esp)
72 /* APP */
73 	movl 12(%ebp),%ecx
74 	movl 16(%ebp),%edi
75 	movl $15,%ebx
76 	movl 24(%ebp),%edx
77 	leal (%edi,%ecx,2),%edi
78 	decl %ecx
79 	movl 20(%ebp),%esi
80 	movl (%edx),%eax
81 	jecxz 1f
82 	decl %eax
83 	andl %ebx,%eax
84 	leal 1088(%esi),%esi
85 	movl %eax,(%edx)
86 1:
87 	leal (%esi,%eax,2),%edx
88 	movl %eax,TEMP
89 	incl %eax
90 	andl %ebx,%eax
91 	leal 544(%esi,%eax,2),%ecx
92 	incl %ebx
93 	testl $1, %eax
94 	jnz 2f
95 	xchgl %edx,%ecx
96 	incl TEMP
97 	leal 544(%esi),%esi
98 2:
99 	pushl 8(%ebp)
100 	pushl %edx
101 	pushl %ecx
102 	call MPL_DCT64
103 	addl $12, %esp
104 	leal 1(%ebx), %ecx
105 	subl TEMP,%ebx
106 	pushl %ecx
107 	/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
108 	movl 28(%ebp),%ecx
109 	leal (%ecx,%ebx,2), %edx
110 	movl (%esp),%ecx /* restore, but leave value on stack */
111 	shrl $1, %ecx
112 #ifdef PIC
113 	movl EBXSAVE, _EBX_
114 #endif
115 	ALIGN16
116 3:
117 	movq  (%edx),%mm0
118 	movq  64(%edx),%mm4
119 	pmaddwd (%esi),%mm0
120 	pmaddwd 32(%esi),%mm4
121 	movq  8(%edx),%mm1
122 	movq  72(%edx),%mm5
123 	pmaddwd 8(%esi),%mm1
124 	pmaddwd 40(%esi),%mm5
125 	movq  16(%edx),%mm2
126 	movq  80(%edx),%mm6
127 	pmaddwd 16(%esi),%mm2
128 	pmaddwd 48(%esi),%mm6
129 	movq  24(%edx),%mm3
130 	movq  88(%edx),%mm7
131 	pmaddwd 24(%esi),%mm3
132 	pmaddwd 56(%esi),%mm7
133 	paddd %mm1,%mm0
134 	paddd %mm5,%mm4
135 	paddd %mm2,%mm0
136 	paddd %mm6,%mm4
137 	paddd %mm3,%mm0
138 	paddd %mm7,%mm4
139 	movq  %mm0,%mm1
140 	movq  %mm4,%mm5
141 	psrlq $32,%mm1
142 	psrlq $32,%mm5
143 	paddd %mm1,%mm0
144 	paddd %mm5,%mm4
145 	psrad $13,%mm0
146 	psrad $13,%mm4
147 	packssdw %mm0,%mm0
148 	packssdw %mm4,%mm4
149 	movq	(%edi), %mm1
150 	punpckldq %mm4, %mm0
151 	pand   LOCAL_VAR(one_null), %mm1
152 	pand   LOCAL_VAR(null_one), %mm0
153 	por    %mm0, %mm1
154 	movq   %mm1,(%edi)
155 	leal 64(%esi),%esi
156 	leal 128(%edx),%edx
157 	leal 8(%edi),%edi
158 	decl %ecx
159 	jnz  3b
160 	popl %ecx
161 	andl $1, %ecx
162 	jecxz 4f
163 	movq  (%edx),%mm0
164 	pmaddwd (%esi),%mm0
165 	movq  8(%edx),%mm1
166 	pmaddwd 8(%esi),%mm1
167 	movq  16(%edx),%mm2
168 	pmaddwd 16(%esi),%mm2
169 	movq  24(%edx),%mm3
170 	pmaddwd 24(%esi),%mm3
171 	paddd %mm1,%mm0
172 	paddd %mm2,%mm0
173 	paddd %mm3,%mm0
174 	movq  %mm0,%mm1
175 	psrlq $32,%mm1
176 	paddd %mm1,%mm0
177 	psrad $13,%mm0
178 	packssdw %mm0,%mm0
179 	movd %mm0,%eax
180 	movw %ax, (%edi)
181 	leal 32(%esi),%esi
182 	leal 64(%edx),%edx
183 	leal 4(%edi),%edi
184 4:
185 	subl $64,%esi
186 	movl $7,%ecx
187 
188 #ifdef PIC
189 	movl EBXSAVE, _EBX_
190 #endif
191 	ALIGN16
192 5:
193 	movq  (%edx),%mm0
194 	movq  64(%edx),%mm4
195 	pmaddwd (%esi),%mm0
196 	pmaddwd -32(%esi),%mm4
197 	movq  8(%edx),%mm1
198 	movq  72(%edx),%mm5
199 	pmaddwd 8(%esi),%mm1
200 	pmaddwd -24(%esi),%mm5
201 	movq  16(%edx),%mm2
202 	movq  80(%edx),%mm6
203 	pmaddwd 16(%esi),%mm2
204 	pmaddwd -16(%esi),%mm6
205 	movq  24(%edx),%mm3
206 	movq  88(%edx),%mm7
207 	pmaddwd 24(%esi),%mm3
208 	pmaddwd -8(%esi),%mm7
209 	paddd %mm1,%mm0
210 	paddd %mm5,%mm4
211 	paddd %mm2,%mm0
212 	paddd %mm6,%mm4
213 	paddd %mm3,%mm0
214 	paddd %mm7,%mm4
215 	movq  %mm0,%mm1
216 	movq  %mm4,%mm5
217 	psrlq $32,%mm1
218 	psrlq $32,%mm5
219 	paddd %mm0,%mm1
220 	paddd %mm4,%mm5
221 	psrad $13,%mm1
222 	psrad $13,%mm5
223 	packssdw %mm1,%mm1
224 	packssdw %mm5,%mm5
225 	psubd %mm0,%mm0
226 	psubd %mm4,%mm4
227 	psubsw %mm1,%mm0
228 	psubsw %mm5,%mm4
229 	movq	(%edi), %mm1
230 	punpckldq %mm4, %mm0
231 	pand   LOCAL_VAR(one_null), %mm1
232 	pand   LOCAL_VAR(null_one), %mm0
233 	por    %mm0, %mm1
234 	movq   %mm1,(%edi)
235 	subl $64,%esi
236 	addl $128,%edx
237 	leal 8(%edi),%edi
238 	decl %ecx
239 	jnz  5b
240 	movq  (%edx),%mm0
241 	pmaddwd (%esi),%mm0
242 	movq  8(%edx),%mm1
243 	pmaddwd 8(%esi),%mm1
244 	movq  16(%edx),%mm2
245 	pmaddwd 16(%esi),%mm2
246 	movq  24(%edx),%mm3
247 	pmaddwd 24(%esi),%mm3
248 	paddd %mm1,%mm0
249 	paddd %mm2,%mm0
250 	paddd %mm3,%mm0
251 	movq  %mm0,%mm1
252 	psrlq $32,%mm1
253 	paddd %mm0,%mm1
254 	psrad $13,%mm1
255 	packssdw %mm1,%mm1
256 	psubd %mm0,%mm0
257 	psubsw %mm1,%mm0
258 	movd %mm0,%eax
259 	movw %ax,(%edi)
260 	emms
261 
262 /* NO_APP */
263 	popl	%ebx
264 	popl	%esi
265 	popl	%edi
266 	mov		%ebp, %esp
267 	popl	%ebp
268 	ret
269