/*
	dct36_sse: SSE optimized dct36

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Syntax : GNU as, AT&T operand order (op src, dst), fed through the C
	         preprocessor.
	Target : 32-bit x86 with SSE.  All five arguments are read from the
	         caller's stack relative to %ebp.

	ALIGN16, ASM_NAME, GET_GOT, LOCAL_VAR, MOVUAPS and NONEXEC_STACK are
	supplied by mangle.h and are not visible in this file.
	NOTE(review): GET_GOT/LOCAL_VAR presumably provide position-independent
	access to the tables below through %ebx (which would be why %ebx is
	saved here although this file never names it otherwise), and MOVUAPS
	presumably selects an aligned or unaligned 16-byte load - confirm both
	against mangle.h.
*/

/*
	Register roles.  Several aliases share one register; each pair is never
	live at the same time:
	  %edi : inbuf while the input is loaded, o1 during the output stage
	  %eax : cos9 table base during the transform, tsbuf during output
	  %edx : tfcos table base during the transform, o2 during output
*/
#define inbuf	%edi
#define o1	%edi
#define o2	%edx
#define wintab	%ecx
#define tsbuf	%eax
#define scratch	%esi
#define cos9	%eax
#define tfcos	%edx

/*
	shufps lane selectors.  With "shufps SEL, src, dst" the two low result
	lanes are picked from dst and the two high result lanes from src; the
	lists below give the picked lane for result lanes 0,1,2,3.
*/
#define SHUF_SWAP01	$0xe1	/* dst1, dst0, src2, src3 */
#define SHUF_ROTATE	$0x93	/* dst3, dst0, src1, src2 */
#define SHUF_HI_LO	$0x4e	/* dst2, dst3, src0, src1 */
#define SHUF_LO_HI	$0xe4	/* dst0, dst1, src2, src3 */
#define SHUF_DUP0_HI	$0xe0	/* dst0, dst0, src2, src3 */
#define SHUF_3322	$0xaf	/* dst3, dst3, src2, src2 */
#define SHUF_REVERSE	$0x1b	/* dst3, dst2, src1, src0 */
#define SHUF_SWAPPAIRS	$0xb1	/* dst1, dst0, src3, src2 */

/*
	void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);

	In   :  8(%ebp) inbuf   18 single-precision values, read only
	       12(%ebp) o1      read only; elements 0..17 are added to the
	                        windowed result that goes to tsbuf
	       16(%ebp) o2      written; elements 0..17
	       20(%ebp) wintab  read only; elements 0..35
	       24(%ebp) tsbuf   written; 18 values, one every 32 floats
	                        (byte offsets 32*4*k for k = 0..17)
	Out  : nothing is returned in a register.
	Saves: %ebx, %esi, %edi, %ebp are preserved; %eax, %ecx, %edx and
	       %xmm0-%xmm7 are overwritten.
	Stack: %esp is realigned to 16 bytes and 80 bytes of scratch space are
	       reserved, accessed with movaps through the scratch register.
*/

#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/*
	Four rows of four floats, each coefficient stored twice so that one row
	scales an (even,odd) pair of lanes.  Numerically the values are
	c[k] = cos(k*pi/18):
	  row 0 (offset  0): c3 c3 c6 c6   0.866025  0.5
	  row 1 (offset 16): c1 c1 c2 c2   0.984808  0.939693
	  row 2 (offset 32): c5 c5 c8 c8   0.642788  0.173648
	  row 3 (offset 48): c7 c7 c4 c4   0.342020  0.766044
*/
dct36_sse_COS9:
	.long 0x3f5db3d7, 0x3f5db3d7, 0x3f000000, 0x3f000000
	.long 0x3f7c1c5c, 0x3f7c1c5c, 0x3f708fb2, 0x3f708fb2
	.long 0x3f248dbb, 0x3f248dbb, 0x3e31d0d4, 0x3e31d0d4
	.long 0x3eaf1d44, 0x3eaf1d44, 0x3f441b7d, 0x3f441b7d
	ALIGN16
/*
	Nine floats; numerically t[i] = 0.5/cos((2*i+1)*pi/36), stored in the
	order t0 t1 t2 t3 | t8 t7 t6 t5 | t4 so that each group of four lines
	up with the lane order used by the butterflies below:
	  0.501910 0.517638 0.551689 0.610387
	  5.736856 1.931852 1.183101 0.871723
	  0.707107
*/
dct36_sse_tfcos36:
	.long 0x3f007d2b, 0x3f0483ee, 0x3f0d3b7d, 0x3f1c4257
	.long 0x40b79454, 0x3ff746ea, 0x3f976fd9, 0x3f5f2944
	.long 0x3f3504f3
	ALIGN16
/* Keeps lanes 1 and 3, clears lanes 0 and 2. */
dct36_sse_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
/* Sign bit of every lane; xorps with it negates all four floats. */
dct36_sse_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
	ALIGN16
	.globl ASM_NAME(dct36_sse)
ASM_NAME(dct36_sse):
	/*
	   Frame: keep the caller's %esp in %ebp, round %esp down to a multiple
	   of 16, reserve 80 bytes, then save three registers (12 bytes).
	   12(%esp) is therefore the 16-byte aligned start of the 80-byte area.
	*/
	pushl		%ebp
	movl		%esp, %ebp
	andl		$-16, %esp
	subl		$80, %esp
	pushl		%ebx
	pushl		%esi
	pushl		%edi
	leal		12(%esp), scratch
	movl		8(%ebp), inbuf

	GET_GOT

	leal		LOCAL_VAR(dct36_sse_COS9), cos9
	leal		LOCAL_VAR(dct36_sse_tfcos36), tfcos

	/*
	   Pass 1: in[i] += in[i-1] for i = 17 down to 1.
	   Load the 18 inputs as xmm1..xmm5 = in[0..3], in[4..7], in[8..11],
	   in[12..15], in[16,17,0,0].  For every vector a copy shifted up by one
	   lane is built, lane 0 is patched (movss) with the last element of the
	   vector below, and the result is added.  The sums stay in registers;
	   nothing is stored back through inbuf.
	*/
	xorps		%xmm0, %xmm0
	xorps		%xmm5, %xmm5
	movlps		64(inbuf), %xmm5
	movups		48(inbuf), %xmm4
	movups		32(inbuf), %xmm3
	movups		16(inbuf), %xmm2
	movups		(inbuf), %xmm1
	movaps		%xmm5, %xmm6
	shufps		SHUF_SWAP01, %xmm6, %xmm6
	movaps		%xmm4, %xmm7
	shufps		SHUF_ROTATE, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm5
	movaps		%xmm3, %xmm6
	shufps		SHUF_ROTATE, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm4
	movaps		%xmm2, %xmm7
	shufps		SHUF_ROTATE, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm3
	movaps		%xmm1, %xmm6
	shufps		SHUF_ROTATE, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm2
	movss		%xmm0, %xmm6		/* lane 0 = 0: in[0] has no predecessor */
	addps		%xmm6, %xmm1

	/*
	   Pass 2: in[i] += in[i-2] for odd i = 17 down to 3.
	   Every vector is re-based by two elements (it takes its high half as
	   new low half and the low half of the vector above as new high half),
	   after which the odd lanes of the old vector - selected with
	   dct36_sse_mask - are added.
	*/
	movaps		LOCAL_VAR(dct36_sse_mask), %xmm0
	movaps		%xmm4, %xmm6
	shufps		SHUF_HI_LO, %xmm5, %xmm4
	movaps		%xmm3, %xmm7
	shufps		SHUF_HI_LO, %xmm6, %xmm3
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm4
	movaps		%xmm2, %xmm6
	shufps		SHUF_HI_LO, %xmm7, %xmm2
	andps		%xmm0, %xmm7
	addps		%xmm7, %xmm3
	movaps		%xmm1, %xmm7
	shufps		SHUF_HI_LO, %xmm6, %xmm1
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm2
	movaps		%xmm7, %xmm6
	andps		%xmm0, %xmm7
	xorps		%xmm0, %xmm0
	addps		%xmm7, %xmm1
	movlhps		%xmm6, %xmm0		/* xmm0 = [0, 0, in0, in1] */

/*
	Register contents now (in[] = values after both passes):
xmm0 in[-,-,0,1]
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/

	/* Rotate the high halves of xmm2/xmm3/xmm4 among each other. */
	movaps		%xmm2, %xmm5
	shufps		SHUF_LO_HI, %xmm3, %xmm5
	shufps		SHUF_LO_HI, %xmm4, %xmm3
	shufps		SHUF_LO_HI, %xmm2, %xmm4
	movaps		%xmm5, %xmm2

/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/

	/* xmm5 = in[6,7,12,13] * COS9 row 0 + [0, 0, in0, in1] */
	mulps		(cos9), %xmm5
	addps		%xmm0, %xmm5

	/* Park two vectors that are needed again after their registers are reused. */
	movaps		%xmm0, (scratch)
	movaps		%xmm2, 16(scratch)

/*
0(scratch) in[-,-,0,1]
16(scratch) in[6,7,12,13]
xmm5 [ta33,tb33,ta66,tb66]
*/

	/*
	   48(scratch) = (xmm1 - xmm3 - xmm4) * COS9 row 0
	                 + [0, 0, in0 - in12, in1 - in13]
	*/
	movaps		%xmm1, %xmm6
	subps		%xmm3, %xmm6
	subps		%xmm4, %xmm6
	xorps		%xmm7, %xmm7
	shufps		SHUF_DUP0_HI, %xmm2, %xmm7	/* xmm7 = [0, 0, in12, in13] */
	mulps		(cos9), %xmm6
	subps		%xmm7, %xmm0
	addps		%xmm0, %xmm6
	movaps		%xmm6, 48(scratch)

	/* COS9 row 1 is used three times below, so it is kept in xmm2. */
	movaps		16(cos9), %xmm2

	/* 32(scratch) = xmm1*row1 + xmm5 + (xmm3*row2 + xmm4*row3) */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		%xmm2, %xmm0
	mulps		32(cos9), %xmm6
	mulps		48(cos9), %xmm7
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm6
	addps		%xmm6, %xmm0
	movaps		%xmm0, 32(scratch)

	/* 64(scratch) = xmm1*row2 - xmm5 + (xmm4*row1 - xmm3*row3) */
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		32(cos9), %xmm0
	mulps		48(cos9), %xmm6
	mulps		%xmm2, %xmm7
	subps		%xmm5, %xmm0
	subps		%xmm6, %xmm7
	addps		%xmm7, %xmm0
	movaps		%xmm0, 64(scratch)

	/*
	   xmm6 = xmm1*row3 - xmm5 + (xmm3*row1 - xmm4*row2)
	   This consumes the row 1 copy in xmm2.
	*/
	movaps		%xmm1, %xmm6
	movaps		%xmm4, %xmm7
	mulps		48(cos9), %xmm6
	mulps		%xmm3, %xmm2
	mulps		32(cos9), %xmm7
	subps		%xmm5, %xmm6
	subps		%xmm7, %xmm2
	addps		%xmm2, %xmm6

	/*
	   Middle pair, from the two high lanes only:
	     lane 2: in0 - in4 + in8 - in12 + in16
	     lane 3: in1 - in5 + in9 - in13 + in17
	   After the shuffle lane 0 holds the lane 3 sum and lane 2 keeps the
	   lane 2 sum; mulss scales lane 0 alone by the ninth tfcos36 entry
	   (offset 32, about 0.707107).
	*/
	movaps		(scratch), %xmm0
	movss		32(tfcos), %xmm5
	subps		%xmm1, %xmm0
	subps		16(scratch), %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		SHUF_3322, %xmm0, %xmm0
	mulss		%xmm5, %xmm0
	movaps		%xmm0, (scratch)

	movaps		32(scratch), %xmm0
	movaps		48(scratch), %xmm1
	movaps		64(scratch), %xmm2

/*
xmm0 [1a-0,1b-0, 2a-0, 2b-0]
xmm1 [1a-1,1b-1, 2a-1, 2b-1]
xmm2 [1a-2,1b-2,-2a-2,-2b-2]
xmm6 [1a-3,1b-3,-2a-3,-2b-3]
*/

	/*
	   4x4 transpose, step 1: interleave rows pairwise.  The vector built
	   from the high halves of xmm2/xmm6 carries the negated entries, so its
	   sign bits are flipped.
	*/
	movaps		%xmm0, %xmm3
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm3
	movaps		%xmm2, %xmm5
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm5
	xorps		LOCAL_VAR(dct36_sse_sign), %xmm5

/*
xmm0 [1a-0,1a-1,1b-0,1b-1]
xmm3 [2a-0,2a-1,2b-0,2b-1]
xmm2 [1a-2,1a-3,1b-2,1b-3]
xmm5 [2a-2,2a-3,2b-2,2b-3]
*/

	/* Transpose, step 2: exchange 64-bit halves. */
	movaps		%xmm0, %xmm1
	movlhps		%xmm2, %xmm0
	movhlps		%xmm1, %xmm2
	movaps		%xmm3, %xmm4
	movlhps		%xmm5, %xmm3
	movhlps		%xmm4, %xmm5

/*
xmm0 tmp1a
xmm3 tmp2a
xmm2 tmp1b
xmm5 tmp2b
*/

	/*
	   Butterflies:
	     xmm3       = tmp2a + tmp1a
	     16(scratch)= tmp2a - tmp1a
	     xmm5       = (tmp2b + tmp1b) * tfcos36[0..3]
	     xmm7       = (tmp2b - tmp1b) * tfcos36[4..7]
	*/
	movaps		(tfcos), %xmm6
	movaps		16(tfcos), %xmm7
	movaps		%xmm5, %xmm1
	addps		%xmm2, %xmm5
	subps		%xmm2, %xmm1
	movaps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	subps		%xmm0, %xmm2
	mulps		%xmm6, %xmm5
	mulps		%xmm1, %xmm7

	movaps		%xmm2, 16(scratch)

/*
%xmm3 tmp[0,1,2,3]
%xmm5 tmp[17,16,15,14]
16(scratch) tmp[8,7,6,5]
%xmm7 tmp[9,10,11,12]
0(scratch) tmp[13,-,4,-]
*/

	/*
	   The table bases in %eax/%edx and the input pointer in %edi are no
	   longer needed; load the remaining arguments into those registers.
	*/
	movl		12(%ebp), o1
	movl		16(%ebp), o2
	movl		20(%ebp), wintab
	movl		24(%ebp), tsbuf

	/*
	   Output stage.  For every v, with sum = tmp[v] + tmp[17-v] and
	   diff = tmp[v] - tmp[17-v]:
	     o2[9+v]          = sum  * wintab[27+v]
	     o2[8-v]          = sum  * wintab[26-v]
	     tsbuf[32*(9+v)]  = o1[9+v] + diff * wintab[9+v]
	     tsbuf[32*(8-v)]  = o1[8-v] + diff * wintab[8-v]
	   Descending index ranges are loaded ascending and reversed with
	   SHUF_REVERSE; the tsbuf results are scattered lane by lane with movss.

	   v = 0..3
	*/
	movaps		%xmm3, %xmm0
	movaps		%xmm5, %xmm1
	movups		108(wintab), %xmm2
	movups		92(wintab), %xmm3
	shufps		SHUF_REVERSE, %xmm3, %xmm3
	movups		36(wintab), %xmm4
	movups		20(wintab), %xmm5
	shufps		SHUF_REVERSE, %xmm5, %xmm5
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(o1), %xmm1
	movups		20(o1), %xmm3
	shufps		SHUF_REVERSE, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		SHUF_REVERSE, %xmm0, %xmm0
	movups		%xmm2, 36(o2)
	movups		%xmm0, 20(o2)
	movss		%xmm1, 32*36(tsbuf)
	movss		%xmm3, 32*20(tsbuf)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(tsbuf)
	movss		%xmm4, 32*28(tsbuf)
	shufps		SHUF_SWAPPAIRS, %xmm1, %xmm1
	shufps		SHUF_SWAPPAIRS, %xmm3, %xmm3
	movss		%xmm1, 32*40(tsbuf)
	movss		%xmm3, 32*24(tsbuf)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(tsbuf)
	movss		%xmm4, 32*32(tsbuf)

	/* v = 4: scalar, using tmp[4] at 8(scratch) and tmp[13] at 0(scratch). */
	movss		8(scratch), %xmm0
	movss		(scratch), %xmm1
	movss		124(wintab), %xmm2
	movss		88(wintab), %xmm3
	movss		52(wintab), %xmm4
	movss		16(wintab), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm1, %xmm0
	subss		%xmm1, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(o1), %xmm4
	addss		16(o1), %xmm6
	movss		%xmm2, 52(o2)
	movss		%xmm0, 16(o2)
	movss		%xmm4, 32*52(tsbuf)
	movss		%xmm6, 32*16(tsbuf)

	/* v = 8,7,6,5 (lane order): tmp[8,7,6,5] with tmp[9,10,11,12]. */
	movaps		16(scratch), %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(wintab), %xmm2
	movups		72(wintab), %xmm3
	shufps		SHUF_REVERSE, %xmm2, %xmm2
	movlps		56(wintab), %xmm4
	movhps		64(wintab), %xmm4
	MOVUAPS		(wintab), %xmm5
	shufps		SHUF_REVERSE, %xmm4, %xmm4
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(o1), %xmm1
	movhps		64(o1), %xmm1
	movups		(o1), %xmm3
	shufps		SHUF_REVERSE, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		SHUF_REVERSE, %xmm2, %xmm2
	movups		%xmm0, (o2)
	movlps		%xmm2, 56(o2)
	movhps		%xmm2, 64(o2)
	movss		%xmm1, 32*56(tsbuf)
	movss		%xmm3, (tsbuf)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(tsbuf)
	movss		%xmm4, 32*8(tsbuf)
	shufps		SHUF_SWAPPAIRS, %xmm1, %xmm1
	shufps		SHUF_SWAPPAIRS, %xmm3, %xmm3
	movss		%xmm1, 32*60(tsbuf)
	movss		%xmm3, 32*4(tsbuf)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(tsbuf)
	movss		%xmm4, 32*12(tsbuf)

	/*
	   %esp still points at the three saved registers.  Restore them, then
	   drop the aligned scratch area by reloading %esp from %ebp.
	*/
	popl		%edi
	popl		%esi
	popl		%ebx
	movl		%ebp, %esp
	popl		%ebp

	ret

NONEXEC_STACK