;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

SECTION_RODATA 32

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION .text

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro
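
; For orientation: T2_3DNOW is the radix-2 step (z0 = a+b, z1 = a-b) and
; T4_3DNOW finishes the same 4-point transform that T4_SSE below does in
; one go. Derived from the lane comments, a hedged complex-pseudocode
; sketch (illustrative only, not part of the build):
;     out0 = (z0 + z1) + (z2 + z3)
;     out1 = (z0 - z1) - I*(z2 - z3)
;     out2 = (z0 + z1) - (z2 + z3)
;     out3 = (z0 - z1) + I*(z2 - z3)
; i.e. a size-4 DIT step on bit-reversed input, matching the revtab
; permutation applied by fft_permute.
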
; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

%macro INTERL 5
%if cpuflag(avx)
    vunpckhps %3, %2, %1
    vunpcklps %2, %2, %1
    vextractf128 %4(%5), %2, 0
    vextractf128 %4 %+ H(%5), %3, 0
    vextractf128 %4(%5 + 1), %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse) || cpuflag(3dnow)
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5),   %2
    mova  %4(%5+1), %3
%endif
%endmacro
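
; PASS_SMALL and PASS_BIG below implement the same split-radix combine on
; blocks that already hold transformed halves. A hedged complex-pseudocode
; sketch, per element, with w = wre + I*wim the twiddle factor (names
; illustrative, derived from the instruction comments):
;     u = z2*conj(w);  v = z3*w;
;     z0' = z0 + (u + v);    z2' = z0 - (u + v);
;     z1' = z1 - I*(u - v);  z3' = z1 + I*(u - v);
; PASS_SMALL keeps everything in registers and is scheduled for sizes that
; fit in cache; PASS_BIG goes through the Z()/Z2() memory operands and can
; optionally interleave the result on the fly (the %1==0 case).
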
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret
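
; The 32-point AVX kernel below builds on the routines above: fft16_avx for
; the even half, T4_SSE/T8_SSE (on ymm registers here) for the two odd
; quarters, and a single register-resident PASS_SMALL combine using the
; cos_32 twiddle table.
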
align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova m2, Z(0)
    mova m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x)   [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x)  [zcq + o3q      + mmsize*(x&1)]
%define ZH(x)  [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q      + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro
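
; FFT_DISPATCH picks the transform for a given nbits from a table of code
; pointers indexed by log2(n); entry 0 is nbits==2 (FFT4), hence the
; (%2q - 2) bias. In PIC builds the tables store section-relative offsets
; (see SECTION_REL below), so the section base $$ is added back before the
; indirect call.
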
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2],      xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1, r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
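
; DECL_FFT emits one split-radix step per size: fft_n calls fft_(n/2) on the
; first half and fft_(n/4) on each quarter of the second half, then
; tail-jumps into the matching pass routine with the cos_n twiddle table,
; and finally builds the dispatch table used by FFT_DISPATCH. The masked
; (n&(-2<<%1)) terms adjust the pointer bumps between calls for the blocked
; intermediate layout described at the top of this file.
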
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
%if cpuflag(sse)
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%elif cpuflag(3dnow)
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endif
%endmacro
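
; POSROTATESHUF walks the two halves of the buffer towards each other (%1
; counts up, %2 counts down), applies the tcos/tsin post-rotation with CMUL,
; reverses the vectors from the high half, and stores the results with real
; and imaginary parts re-interleaved, fusing the IMDCT post-rotation with
; the mirrored output reordering in a single pass.
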
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
    vmovaps      ymm1, [%3+%1*2]
    vmovaps      ymm0, [%3+%1*2+0x20]
    vmovaps      ymm3, [%3+%2*2]
    vmovaps      ymm2, [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub          %2, 0x20
    add          %1, 0x20
    jl           .post
%elif cpuflag(sse)
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2, 0x10
    add      %1, 0x10
    jl       .post
%elif cpuflag(3dnow)
    CMUL  %3, %1, m0, m1, %4, %5
    CMUL  %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq m0, 32
    psrlq m1, 32
    psrlq m2, 32
    psrlq m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub   %2, 8
    add   %1, 8
    jl    .post
%endif
%endmacro

%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov    r5, r0
    mov    r6, r1
    mov    r0, r1
    mov    r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1
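
    ; Post-rotate and reorder in place: r6 is moved to the midpoint of the
    ; output (the z+n8 that POSROTATESHUF expects), r0/r1 become the
    ; negative/positive loop offsets, and on x86_32 tcos/tsin are reloaded
    ; from the stack spill.
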
    mov    r0d, [r5+FFTContext.mdctsize]
    add    r6, r0
    shr    r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov    rtcos, [esp+8]
    mov    rtsin, [esp+4]
%endif
    neg    r0
    mov    r1, -mmsize
    sub    r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT

INIT_MMX 3dnowext
DECL_IMDCT
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT
%endif