/*
	decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)

	copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by the mysterious higway for MMX (apparently)
	then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
	Both have agreed to distribution under LGPL 2.1 .

	Transformed back into standalone asm, with help of
	gcc -S -DHAVE_CONFIG_H -I.  -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}

	The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
	This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
	That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
	Maybe I'll change it in future, but now I need something that works.

	Original comment from MPlayer source follows. Regarding the license history see
	synth_mmx.S, which the original comment about this being licensed under GPL is
	relating to.
*/

/*
 * This code was taken from http://www.mpg123.org
 * See ChangeLog of mpg123-0.59s-pre.1 for detail
 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
 *
 * Local ChangeLog:
 * - Partial loops unrolling and removing MOVW insn from loops
*/

/*
	Syntax / target: GNU as, AT&T operand order, 32-bit x86 with MMX,
	run through the C preprocessor. Arguments are read from the stack
	relative to %ebp (see the offsets listed at the function entry).

	ALIGN8, ALIGN16, GET_GOT, LOCAL_VAR and _EBX_ are provided by mangle.h,
	SYNTH_NAME and MPL_DCT64 by the including file; none of them are
	defined here.
	NOTE(review): LOCAL_VAR presumably addresses data relative to _EBX_ in
	PIC builds - that would explain why _EBX_ is reloaded from EBXSAVE in
	front of every block that uses LOCAL_VAR; confirm against mangle.h.
*/

#include "mangle.h"

	.data
	ALIGN8
/* 0xFFFF0000 in both dwords: keeps the upper 16-bit word of each 32-bit pair. */
one_null:
	.long	-65536
	.long	-65536
	ALIGN8
/* 0x0000FFFF in both dwords: keeps the lower 16-bit word of each 32-bit pair. */
null_one:
	.long	65535
	.long	65535

	.text
	ALIGN16
/*
	void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins)

	Writes 32 output samples of 16 bits each, saturated (packssdw/psubsw),
	to every second 16-bit slot starting at samples + channel; the slots in
	between are read back and stored unchanged.

	Register roles after setup:
	  %esi  read position in the buffer filled by MPL_DCT64
	  %edx  read position in the window table (decwins)
	  %edi  write position in samples
	  %ecx  loop counter
	  %eax  scratch; in PIC builds it doubles as _EBX_ and is therefore
	        reloaded from EBXSAVE before each LOCAL_VAR user

	%ebx, %esi and %edi are used across the call to MPL_DCT64 without being
	reloaded, so the callee has to preserve them.
	%ebx, %esi, %edi and %ebp are restored before returning; %eax, %ecx,
	%edx, the flags and the MMX registers are clobbered (emms is executed
	before the return).
*/
.globl SYNTH_NAME
SYNTH_NAME:
	pushl	%ebp
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
	movl	%esp, %ebp

/* Now the old stack addresses are preserved via %ebp. */
#ifdef PIC
	subl	$8,%esp /* What has been called temp before, plus EBXSAVE at -4(%ebp). */
#else
	subl	$4,%esp /* What has been called temp before. */
#endif
	pushl	%edi
	pushl	%esi
	pushl	%ebx

#ifdef PIC
#undef _EBX_
#define _EBX_ %eax
	GET_GOT
#define EBXSAVE -4(%ebp)
	movl	_EBX_, EBXSAVE /* save PIC register */
#endif

/*
	TEMP is the slot right above the three saved registers. It is addressed
	relative to %esp, so it is only valid while %esp sits at its
	post-prologue value - i.e. not between a pushl and the matching pop/addl.
*/
#define TEMP 12(%esp)
/* APP */
	movl	12(%ebp),%ecx		/* ecx = channel */
	movl	16(%ebp),%edi		/* edi = samples */
	movl	$15,%ebx		/* ebx = 15: wrap mask for the bo index */
	movl	24(%ebp),%edx		/* edx = bo (pointer) */
	leal	(%edi,%ecx,2),%edi	/* edi = samples + channel (16-bit units) */
	decl	%ecx
	movl	20(%ebp),%esi		/* esi = buffs */
	movl	(%edx),%eax		/* eax = *bo */
	jecxz	1f			/* channel == 1: leave *bo and buffs alone */
	decl	%eax
	andl	%ebx,%eax
	leal	1088(%esi),%esi		/* other half of buffs, 1088 bytes further */
	movl	%eax,(%edx)		/* *bo = (*bo - 1) & 15 */
1:
	leal	(%esi,%eax,2),%edx	/* edx = buf + 2*bo */
	movl	%eax,TEMP		/* TEMP = bo */
	incl	%eax
	andl	%ebx,%eax		/* eax = (bo + 1) & 15 */
	leal	544(%esi,%eax,2),%ecx	/* ecx = buf + 544 + 2*((bo + 1) & 15) */
	incl	%ebx			/* ebx = 16 */
	testl	$1, %eax
	jnz	2f			/* (bo + 1) odd: keep the setup as it is */
	xchgl	%edx,%ecx		/* otherwise swap the two DCT outputs ... */
	incl	TEMP			/* ... use bo + 1 as window offset ... */
	leal	544(%esi),%esi		/* ... and read from 544 bytes further on */
2:
	/* MPL_DCT64(%ecx, %edx, bandPtr); arguments are popped by the caller. */
	pushl	8(%ebp)
	pushl	%edx
	pushl	%ecx
	call	MPL_DCT64
	addl	$12, %esp
	leal	1(%ebx), %ecx		/* ecx = 17: samples handled by the first part */
	subl	TEMP,%ebx		/* ebx = 16 - TEMP */
	pushl	%ecx
/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
	movl	28(%ebp),%ecx
	leal	(%ecx,%ebx,2), %edx	/* edx = decwins + 2*(16 - TEMP) bytes */
	movl	(%esp),%ecx		/* restore, but leave value on stack */
	shrl	$1, %ecx		/* 17 / 2 = 8 iterations, two samples each */
#ifdef PIC
	movl	EBXSAVE, _EBX_
#endif
	ALIGN16
/*
	First part, unrolled by two: each sample is the sum of 16 products of
	16-bit values (4 x pmaddwd, horizontally added), shifted right by 13 and
	saturated to 16 bits.
*/
3:
	movq	(%edx),%mm0
	movq	64(%edx),%mm4
	pmaddwd	(%esi),%mm0
	pmaddwd	32(%esi),%mm4
	movq	8(%edx),%mm1
	movq	72(%edx),%mm5
	pmaddwd	8(%esi),%mm1
	pmaddwd	40(%esi),%mm5
	movq	16(%edx),%mm2
	movq	80(%edx),%mm6
	pmaddwd	16(%esi),%mm2
	pmaddwd	48(%esi),%mm6
	movq	24(%edx),%mm3
	movq	88(%edx),%mm7
	pmaddwd	24(%esi),%mm3
	pmaddwd	56(%esi),%mm7
	paddd	%mm1,%mm0
	paddd	%mm5,%mm4
	paddd	%mm2,%mm0
	paddd	%mm6,%mm4
	paddd	%mm3,%mm0
	paddd	%mm7,%mm4
	movq	%mm0,%mm1
	movq	%mm4,%mm5
	psrlq	$32,%mm1
	psrlq	$32,%mm5
	paddd	%mm1,%mm0		/* low dword = sum of both halves */
	paddd	%mm5,%mm4
	psrad	$13,%mm0
	psrad	$13,%mm4
	packssdw	%mm0,%mm0
	packssdw	%mm4,%mm4
	movq	(%edi), %mm1		/* current contents of the output slots */
	punpckldq	%mm4, %mm0	/* first sample in dword 0, second in dword 1 */
	pand	LOCAL_VAR(one_null), %mm1	/* keep the upper word of each pair from memory */
	pand	LOCAL_VAR(null_one), %mm0	/* keep the lower word of each pair from the result */
	por	%mm0, %mm1
	movq	%mm1,(%edi)
	leal	64(%esi),%esi
	leal	128(%edx),%edx
	leal	8(%edi),%edi
	decl	%ecx
	jnz	3b
	popl	%ecx
	andl	$1, %ecx
	jecxz	4f			/* even count: no leftover sample */
	/* Leftover (17th) sample of the first part. */
	movq	(%edx),%mm0
	pmaddwd	(%esi),%mm0
	movq	8(%edx),%mm1
	pmaddwd	8(%esi),%mm1
	movq	16(%edx),%mm2
	pmaddwd	16(%esi),%mm2
	movq	24(%edx),%mm3
	pmaddwd	24(%esi),%mm3
	paddd	%mm1,%mm0
	paddd	%mm2,%mm0
	paddd	%mm3,%mm0
	movq	%mm0,%mm1
	psrlq	$32,%mm1
	paddd	%mm1,%mm0
	psrad	$13,%mm0
	packssdw	%mm0,%mm0
	movd	%mm0,%eax		/* clobbers _EBX_ in PIC builds; reloaded below */
	movw	%ax, (%edi)
	leal	32(%esi),%esi
	leal	64(%edx),%edx
	leal	4(%edi),%edi
4:
	subl	$64,%esi
	movl	$7,%ecx			/* 7 iterations, two samples each */

#ifdef PIC
	movl	EBXSAVE, _EBX_
#endif
	ALIGN16
/*
	Second part, unrolled by two: same multiply-accumulate, but the buffer
	is walked backwards (%esi -= 64, second sample at -32(%esi)) and the
	result is negated with saturation (0 - x via psubd/psubsw).
*/
5:
	movq	(%edx),%mm0
	movq	64(%edx),%mm4
	pmaddwd	(%esi),%mm0
	pmaddwd	-32(%esi),%mm4
	movq	8(%edx),%mm1
	movq	72(%edx),%mm5
	pmaddwd	8(%esi),%mm1
	pmaddwd	-24(%esi),%mm5
	movq	16(%edx),%mm2
	movq	80(%edx),%mm6
	pmaddwd	16(%esi),%mm2
	pmaddwd	-16(%esi),%mm6
	movq	24(%edx),%mm3
	movq	88(%edx),%mm7
	pmaddwd	24(%esi),%mm3
	pmaddwd	-8(%esi),%mm7
	paddd	%mm1,%mm0
	paddd	%mm5,%mm4
	paddd	%mm2,%mm0
	paddd	%mm6,%mm4
	paddd	%mm3,%mm0
	paddd	%mm7,%mm4
	movq	%mm0,%mm1
	movq	%mm4,%mm5
	psrlq	$32,%mm1
	psrlq	$32,%mm5
	paddd	%mm0,%mm1
	paddd	%mm4,%mm5
	psrad	$13,%mm1
	psrad	$13,%mm5
	packssdw	%mm1,%mm1
	packssdw	%mm5,%mm5
	psubd	%mm0,%mm0		/* mm0 = 0 */
	psubd	%mm4,%mm4		/* mm4 = 0 */
	psubsw	%mm1,%mm0		/* mm0 = -sample, saturated */
	psubsw	%mm5,%mm4
	movq	(%edi), %mm1
	punpckldq	%mm4, %mm0
	pand	LOCAL_VAR(one_null), %mm1
	pand	LOCAL_VAR(null_one), %mm0
	por	%mm0, %mm1
	movq	%mm1,(%edi)
	subl	$64,%esi
	addl	$128,%edx
	leal	8(%edi),%edi
	decl	%ecx
	jnz	5b
	/* Last (15th) sample of the second part. */
	movq	(%edx),%mm0
	pmaddwd	(%esi),%mm0
	movq	8(%edx),%mm1
	pmaddwd	8(%esi),%mm1
	movq	16(%edx),%mm2
	pmaddwd	16(%esi),%mm2
	movq	24(%edx),%mm3
	pmaddwd	24(%esi),%mm3
	paddd	%mm1,%mm0
	paddd	%mm2,%mm0
	paddd	%mm3,%mm0
	movq	%mm0,%mm1
	psrlq	$32,%mm1
	paddd	%mm0,%mm1
	psrad	$13,%mm1
	packssdw	%mm1,%mm1
	psubd	%mm0,%mm0
	psubsw	%mm1,%mm0
	movd	%mm0,%eax
	movw	%ax,(%edi)
	emms				/* leave MMX state so x87 code can run again */

/* NO_APP */
	popl	%ebx
	popl	%esi
	popl	%edi
	mov	%ebp, %esp
	popl	%ebp
	ret

/*
	Tell the GNU linker that this object does not need an executable stack.
	Without this note ld assumes the opposite for the whole output file.
	.previous switches back to the section that was active before, so code
	following an #include of this template is not affected.
*/
#if defined(__ELF__) && defined(__linux__)
	.section .note.GNU-stack,"",%progbits
	.previous
#endif