;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e.
{4x real, 4x imaginary, 4x real, ...} (or 2x respectively) 30 31%include "libavutil/x86/x86util.asm" 32 33%if ARCH_X86_64 34%define pointer resq 35%else 36%define pointer resd 37%endif 38 39SECTION_RODATA 32 40 41struc FFTContext 42 .nbits: resd 1 43 .reverse: resd 1 44 .revtab: pointer 1 45 .tmpbuf: pointer 1 46 .mdctsize: resd 1 47 .mdctbits: resd 1 48 .tcos: pointer 1 49 .tsin: pointer 1 50 .fftperm: pointer 1 51 .fftcalc: pointer 1 52 .imdctcalc:pointer 1 53 .imdcthalf:pointer 1 54endstruc 55 56%define M_SQRT1_2 0.70710678118654752440 57%define M_COS_PI_1_8 0.923879532511287 58%define M_COS_PI_3_8 0.38268343236509 59 60ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 61ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 62 63ps_root2: times 8 dd M_SQRT1_2 64ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 65ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 66 67perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 68perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 69ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 70ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 71ps_m1p1: dd 1<<31, 0 72 73cextern ps_neg 74 75%assign i 16 76%rep 13 77cextern cos_ %+ i 78%assign i i<<1 79%endrep 80 81%if ARCH_X86_64 82 %define pointer dq 83%else 84 %define pointer dd 85%endif 86 87%macro IF0 1+ 88%endmacro 89%macro IF1 1+ 90 %1 91%endmacro 92 93SECTION_TEXT 94 95%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 96 mova %1, %3 97 mova %2, %1 98 pfadd %1, %4 99 pfsub %2, %4 100%endmacro 101 102%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 103 mova %5, %3 104 pfsub %3, %4 105 pfadd %5, %4 ; {t6,t5} 106 pxor %3, [ps_m1p1] ; {t8,t7} 107 mova %6, %1 108 movd [r0+12], %3 109 punpckhdq %3, [r0+8] 110 pfadd %1, %5 ; {r0,i0} 111 pfsub %6, %5 ; {r2,i2} 112 mova %4, %2 113 
pfadd %2, %3 ; {r1,i1} 114 pfsub %4, %3 ; {r3,i3} 115 SWAP %3, %6 116%endmacro 117 118; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} 119; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} 120; %3, %4, %5 tmp 121; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} 122; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} 123%macro T8_AVX 5 124 vsubps %5, %1, %2 ; v = %1 - %2 125 vaddps %3, %1, %2 ; w = %1 + %2 126 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 127 vpermilps %2, %2, [perm1] 128 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} 129 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} 130 vsubps %4, %5, %1 ; s = r - q 131 vaddps %1, %5, %1 ; u = r + q 132 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} 133 vshufps %5, %4, %1, 0xbb 134 vshufps %3, %4, %1, 0xee 135 vperm2f128 %3, %3, %5, 0x13 136 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} 137 vshufps %2, %1, %4, 0xdd 138 vshufps %1, %1, %4, 0x88 139 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} 140 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} 141 vsubps %5, %1, %3 142 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} 143 vsubps %2, %4, %1 ; %2 = v - w 144 vaddps %1, %4, %1 ; %1 = v + w 145%endmacro 146 147; In SSE mode do one fft4 transforms 148; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} 149; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} 150; 151; In AVX mode do two fft4 transforms 152; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} 153; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} 154%macro T4_SSE 3 155 subps %3, %1, %2 ; {t3,t4,-t8,t7} 156 addps %1, %1, %2 ; {t1,t2,t6,t5} 157 xorps %3, %3, [ps_p1p1m1p1] 158 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} 159 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} 160 subps %3, %1, %2 ; {r2,i2,r3,i3} 161 addps %1, %1, %2 ; {r0,i0,r1,i1} 162 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} 163 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} 164%endmacro 165 166; In SSE mode do one FFT8 167; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} 
%4={r5,i5,r7,i7} 168; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} 169; 170; In AVX mode do two FFT8 171; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} 172; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} 173; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} 174; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} 175%macro T8_SSE 6 176 addps %6, %3, %4 ; {t1,t2,t3,t4} 177 subps %3, %3, %4 ; {r5,i5,r7,i7} 178 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} 179 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} 180 mulps %4, %4, [ps_root2] 181 addps %3, %3, %4 ; {t8,t7,ta,t9} 182 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} 183 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} 184 subps %3, %6, %4 ; {t6,t5,tc,tb} 185 addps %6, %6, %4 ; {t1,t2,t9,ta} 186 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} 187 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} 188 subps %3, %1, %6 ; {r4,r5,r6,r7} 189 addps %1, %1, %6 ; {r0,r1,r2,r3} 190 subps %4, %2, %5 ; {i4,i5,i6,i7} 191 addps %2, %2, %5 ; {i0,i1,i2,i3} 192%endmacro 193 194; scheduled for cpu-bound sizes 195%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim 196IF%1 mova m4, Z(4) 197IF%1 mova m5, Z(5) 198 mova m0, %2 ; wre 199 mova m1, %3 ; wim 200 mulps m2, m4, m0 ; r2*wre 201IF%1 mova m6, Z2(6) 202 mulps m3, m5, m1 ; i2*wim 203IF%1 mova m7, Z2(7) 204 mulps m4, m4, m1 ; r2*wim 205 mulps m5, m5, m0 ; i2*wre 206 addps m2, m2, m3 ; r2*wre + i2*wim 207 mulps m3, m1, m7 ; i3*wim 208 subps m5, m5, m4 ; i2*wre - r2*wim 209 mulps m1, m1, m6 ; r3*wim 210 mulps m4, m0, m6 ; r3*wre 211 mulps m0, m0, m7 ; i3*wre 212 subps m4, m4, m3 ; r3*wre - i3*wim 213 mova m3, Z(0) 214 addps m0, m0, m1 ; i3*wre + r3*wim 215 subps m1, m4, m2 ; t3 216 addps m4, m4, m2 ; t5 217 subps m3, m3, m4 ; r2 218 addps m4, m4, Z(0) ; r0 219 mova m6, Z(2) 220 mova Z(4), m3 221 mova Z(0), m4 222 subps m3, m5, m0 ; t4 223 subps m4, m6, m3 ; r3 224 addps m3, m3, m6 ; r1 225 mova Z2(6), m4 226 mova Z(2), m3 
227 mova m2, Z(3) 228 addps m3, m5, m0 ; t6 229 subps m2, m2, m1 ; i3 230 mova m7, Z(1) 231 addps m1, m1, Z(3) ; i1 232 mova Z2(7), m2 233 mova Z(3), m1 234 subps m4, m7, m3 ; i2 235 addps m3, m3, m7 ; i0 236 mova Z(5), m4 237 mova Z(1), m3 238%endmacro 239 240; scheduled to avoid store->load aliasing 241%macro PASS_BIG 1 ; (!interleave) 242 mova m4, Z(4) ; r2 243 mova m5, Z(5) ; i2 244 mova m0, [wq] ; wre 245 mova m1, [wq+o1q] ; wim 246 mulps m2, m4, m0 ; r2*wre 247 mova m6, Z2(6) ; r3 248 mulps m3, m5, m1 ; i2*wim 249 mova m7, Z2(7) ; i3 250 mulps m4, m4, m1 ; r2*wim 251 mulps m5, m5, m0 ; i2*wre 252 addps m2, m2, m3 ; r2*wre + i2*wim 253 mulps m3, m1, m7 ; i3*wim 254 mulps m1, m1, m6 ; r3*wim 255 subps m5, m5, m4 ; i2*wre - r2*wim 256 mulps m4, m0, m6 ; r3*wre 257 mulps m0, m0, m7 ; i3*wre 258 subps m4, m4, m3 ; r3*wre - i3*wim 259 mova m3, Z(0) 260 addps m0, m0, m1 ; i3*wre + r3*wim 261 subps m1, m4, m2 ; t3 262 addps m4, m4, m2 ; t5 263 subps m3, m3, m4 ; r2 264 addps m4, m4, Z(0) ; r0 265 mova m6, Z(2) 266 mova Z(4), m3 267 mova Z(0), m4 268 subps m3, m5, m0 ; t4 269 subps m4, m6, m3 ; r3 270 addps m3, m3, m6 ; r1 271IF%1 mova Z2(6), m4 272IF%1 mova Z(2), m3 273 mova m2, Z(3) 274 addps m5, m5, m0 ; t6 275 subps m2, m2, m1 ; i3 276 mova m7, Z(1) 277 addps m1, m1, Z(3) ; i1 278IF%1 mova Z2(7), m2 279IF%1 mova Z(3), m1 280 subps m6, m7, m5 ; i2 281 addps m5, m5, m7 ; i0 282IF%1 mova Z(5), m6 283IF%1 mova Z(1), m5 284%if %1==0 285 INTERL m1, m3, m7, Z, 2 286 INTERL m2, m4, m0, Z2, 6 287 288 mova m1, Z(0) 289 mova m2, Z(4) 290 291 INTERL m5, m1, m3, Z, 0 292 INTERL m6, m2, m7, Z, 4 293%endif 294%endmacro 295 296%macro PUNPCK 3 297 mova %3, %1 298 punpckldq %1, %2 299 punpckhdq %3, %2 300%endmacro 301 302%define Z(x) [r0+mmsize*x] 303%define Z2(x) [r0+mmsize*x] 304%define ZH(x) [r0+mmsize*x+mmsize/2] 305 306INIT_YMM avx 307 308%if HAVE_AVX_EXTERNAL 309align 16 310fft8_avx: 311 mova m0, Z(0) 312 mova m1, Z(1) 313 T8_AVX m0, m1, m2, m3, m4 314 mova Z(0), m0 315 mova 
Z(1), m1 316 ret 317 318 319align 16 320fft16_avx: 321 mova m2, Z(2) 322 mova m3, Z(3) 323 T4_SSE m2, m3, m7 324 325 mova m0, Z(0) 326 mova m1, Z(1) 327 T8_AVX m0, m1, m4, m5, m7 328 329 mova m4, [ps_cos16_1] 330 mova m5, [ps_cos16_2] 331 vmulps m6, m2, m4 332 vmulps m7, m3, m5 333 vaddps m7, m7, m6 334 vmulps m2, m2, m5 335 vmulps m3, m3, m4 336 vsubps m3, m3, m2 337 vblendps m2, m7, m3, 0xf0 338 vperm2f128 m3, m7, m3, 0x21 339 vaddps m4, m2, m3 340 vsubps m2, m3, m2 341 vperm2f128 m2, m2, m2, 0x01 342 vsubps m3, m1, m2 343 vaddps m1, m1, m2 344 vsubps m5, m0, m4 345 vaddps m0, m0, m4 346 vextractf128 Z(0), m0, 0 347 vextractf128 ZH(0), m1, 0 348 vextractf128 Z(1), m0, 1 349 vextractf128 ZH(1), m1, 1 350 vextractf128 Z(2), m5, 0 351 vextractf128 ZH(2), m3, 0 352 vextractf128 Z(3), m5, 1 353 vextractf128 ZH(3), m3, 1 354 ret 355 356align 16 357fft32_avx: 358 call fft16_avx 359 360 mova m0, Z(4) 361 mova m1, Z(5) 362 363 T4_SSE m0, m1, m4 364 365 mova m2, Z(6) 366 mova m3, Z(7) 367 368 T8_SSE m0, m1, m2, m3, m4, m6 369 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} 370 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} 371 372 vperm2f128 m4, m0, m2, 0x20 373 vperm2f128 m5, m1, m3, 0x20 374 vperm2f128 m6, m0, m2, 0x31 375 vperm2f128 m7, m1, m3, 0x31 376 377 PASS_SMALL 0, [cos_32], [cos_32+32] 378 379 ret 380 381fft32_interleave_avx: 382 call fft32_avx 383 mov r2d, 32 384.deint_loop: 385 mova m2, Z(0) 386 mova m3, Z(1) 387 vunpcklps m0, m2, m3 388 vunpckhps m1, m2, m3 389 vextractf128 Z(0), m0, 0 390 vextractf128 ZH(0), m1, 0 391 vextractf128 Z(1), m0, 1 392 vextractf128 ZH(1), m1, 1 393 add r0, mmsize*2 394 sub r2d, mmsize/4 395 jg .deint_loop 396 ret 397 398%endif 399 400INIT_XMM sse 401 402align 16 403fft4_avx: 404fft4_sse: 405 mova m0, Z(0) 406 mova m1, Z(1) 407 T4_SSE m0, m1, m2 408 mova Z(0), m0 409 mova Z(1), m1 410 ret 411 412align 16 413fft8_sse: 414 mova m0, Z(0) 415 mova m1, Z(1) 416 T4_SSE m0, m1, m2 417 mova m2, Z(2) 
418 mova m3, Z(3) 419 T8_SSE m0, m1, m2, m3, m4, m5 420 mova Z(0), m0 421 mova Z(1), m1 422 mova Z(2), m2 423 mova Z(3), m3 424 ret 425 426align 16 427fft16_sse: 428 mova m0, Z(0) 429 mova m1, Z(1) 430 T4_SSE m0, m1, m2 431 mova m2, Z(2) 432 mova m3, Z(3) 433 T8_SSE m0, m1, m2, m3, m4, m5 434 mova m4, Z(4) 435 mova m5, Z(5) 436 mova Z(0), m0 437 mova Z(1), m1 438 mova Z(2), m2 439 mova Z(3), m3 440 T4_SSE m4, m5, m6 441 mova m6, Z2(6) 442 mova m7, Z2(7) 443 T4_SSE m6, m7, m0 444 PASS_SMALL 0, [cos_16], [cos_16+16] 445 ret 446 447 448%macro FFT48_3DNOW 0 449align 16 450fft4 %+ SUFFIX: 451 T2_3DNOW m0, m1, Z(0), Z(1) 452 mova m2, Z(2) 453 mova m3, Z(3) 454 T4_3DNOW m0, m1, m2, m3, m4, m5 455 PUNPCK m0, m1, m4 456 PUNPCK m2, m3, m5 457 mova Z(0), m0 458 mova Z(1), m4 459 mova Z(2), m2 460 mova Z(3), m5 461 ret 462 463align 16 464fft8 %+ SUFFIX: 465 T2_3DNOW m0, m1, Z(0), Z(1) 466 mova m2, Z(2) 467 mova m3, Z(3) 468 T4_3DNOW m0, m1, m2, m3, m4, m5 469 mova Z(0), m0 470 mova Z(2), m2 471 T2_3DNOW m4, m5, Z(4), Z(5) 472 T2_3DNOW m6, m7, Z2(6), Z2(7) 473 PSWAPD m0, m5 474 PSWAPD m2, m7 475 pxor m0, [ps_m1p1] 476 pxor m2, [ps_m1p1] 477 pfsub m5, m0 478 pfadd m7, m2 479 pfmul m5, [ps_root2] 480 pfmul m7, [ps_root2] 481 T4_3DNOW m1, m3, m5, m7, m0, m2 482 mova Z(5), m5 483 mova Z2(7), m7 484 mova m0, Z(0) 485 mova m2, Z(2) 486 T4_3DNOW m0, m2, m4, m6, m5, m7 487 PUNPCK m0, m1, m5 488 PUNPCK m2, m3, m7 489 mova Z(0), m0 490 mova Z(1), m5 491 mova Z(2), m2 492 mova Z(3), m7 493 PUNPCK m4, Z(5), m5 494 PUNPCK m6, Z2(7), m7 495 mova Z(4), m4 496 mova Z(5), m5 497 mova Z2(6), m6 498 mova Z2(7), m7 499 ret 500%endmacro 501 502%if ARCH_X86_32 503INIT_MMX 3dnowext 504FFT48_3DNOW 505 506INIT_MMX 3dnow 507FFT48_3DNOW 508%endif 509 510%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] 511%define Z2(x) [zcq + o3q + mmsize*(x&1)] 512%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] 513%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] 514 515%macro DECL_PASS 2+ ; name, payload 
516align 16 517%1: 518DEFINE_ARGS zc, w, n, o1, o3 519 lea o3q, [nq*3] 520 lea o1q, [nq*8] 521 shl o3q, 4 522.loop: 523 %2 524 add zcq, mmsize*2 525 add wq, mmsize 526 sub nd, mmsize/8 527 jg .loop 528 rep ret 529%endmacro 530 531%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs 532 lea r2, [dispatch_tab%1] 533 mov r2, [r2 + (%2q-2)*gprsize] 534%ifdef PIC 535 lea r3, [$$] 536 add r2, r3 537%endif 538 call r2 539%endmacro ; FFT_DISPATCH 540 541INIT_YMM avx 542 543%if HAVE_AVX_EXTERNAL 544%macro INTERL_AVX 5 545 vunpckhps %3, %2, %1 546 vunpcklps %2, %2, %1 547 vextractf128 %4(%5), %2, 0 548 vextractf128 %4 %+ H(%5), %3, 0 549 vextractf128 %4(%5 + 1), %2, 1 550 vextractf128 %4 %+ H(%5 + 1), %3, 1 551%endmacro 552 553%define INTERL INTERL_AVX 554 555DECL_PASS pass_avx, PASS_BIG 1 556DECL_PASS pass_interleave_avx, PASS_BIG 0 557 558cglobal fft_calc, 2,5,8 559 mov r3d, [r0 + FFTContext.nbits] 560 mov r0, r1 561 mov r1, r3 562 FFT_DISPATCH _interleave %+ SUFFIX, r1 563 REP_RET 564 565%endif 566 567INIT_XMM sse 568 569%macro INTERL_SSE 5 570 mova %3, %2 571 unpcklps %2, %1 572 unpckhps %3, %1 573 mova %4(%5), %2 574 mova %4(%5+1), %3 575%endmacro 576 577%define INTERL INTERL_SSE 578 579DECL_PASS pass_sse, PASS_BIG 1 580DECL_PASS pass_interleave_sse, PASS_BIG 0 581 582%macro FFT_CALC_FUNC 0 583cglobal fft_calc, 2,5,8 584 mov r3d, [r0 + FFTContext.nbits] 585 PUSH r1 586 PUSH r3 587 mov r0, r1 588 mov r1, r3 589 FFT_DISPATCH _interleave %+ SUFFIX, r1 590 POP rcx 591 POP r4 592 cmp rcx, 3+(mmsize/16) 593 jg .end 594 mov r2, -1 595 add rcx, 3 596 shl r2, cl 597 sub r4, r2 598.loop: 599%if mmsize == 8 600 PSWAPD m0, [r4 + r2 + 4] 601 mova [r4 + r2 + 4], m0 602%else 603 movaps xmm0, [r4 + r2] 604 movaps xmm1, xmm0 605 unpcklps xmm0, [r4 + r2 + 16] 606 unpckhps xmm1, [r4 + r2 + 16] 607 movaps [r4 + r2], xmm0 608 movaps [r4 + r2 + 16], xmm1 609%endif 610 add r2, mmsize*2 611 jl .loop 612.end: 613%if cpuflag(3dnow) 614 femms 615 RET 616%else 617 REP_RET 618%endif 619%endmacro 620 
621%if ARCH_X86_32 622INIT_MMX 3dnow 623FFT_CALC_FUNC 624INIT_MMX 3dnowext 625FFT_CALC_FUNC 626%endif 627INIT_XMM sse 628FFT_CALC_FUNC 629 630cglobal fft_permute, 2,7,1 631 mov r4, [r0 + FFTContext.revtab] 632 mov r5, [r0 + FFTContext.tmpbuf] 633 mov ecx, [r0 + FFTContext.nbits] 634 mov r2, 1 635 shl r2, cl 636 xor r0, r0 637%if ARCH_X86_32 638 mov r1, r1m 639%endif 640.loop: 641 movaps xmm0, [r1 + 8*r0] 642 movzx r6, word [r4 + 2*r0] 643 movzx r3, word [r4 + 2*r0 + 2] 644 movlps [r5 + 8*r6], xmm0 645 movhps [r5 + 8*r3], xmm0 646 add r0, 2 647 cmp r0, r2 648 jl .loop 649 shl r2, 3 650 add r1, r2 651 add r5, r2 652 neg r2 653; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B 654.loopcopy: 655 movaps xmm0, [r5 + r2] 656 movaps xmm1, [r5 + r2 + 16] 657 movaps [r1 + r2], xmm0 658 movaps [r1 + r2 + 16], xmm1 659 add r2, 32 660 jl .loopcopy 661 REP_RET 662 663%macro IMDCT_CALC_FUNC 0 664cglobal imdct_calc, 3,5,3 665 mov r3d, [r0 + FFTContext.mdctsize] 666 mov r4, [r0 + FFTContext.imdcthalf] 667 add r1, r3 668 PUSH r3 669 PUSH r1 670%if ARCH_X86_32 671 push r2 672 push r1 673 push r0 674%else 675 sub rsp, 8+32*WIN64 ; allocate win64 shadow space 676%endif 677 call r4 678%if ARCH_X86_32 679 add esp, 12 680%else 681 add rsp, 8+32*WIN64 682%endif 683 POP r1 684 POP r3 685 lea r0, [r1 + 2*r3] 686 mov r2, r3 687 sub r3, mmsize 688 neg r2 689 mova m2, [ps_neg] 690.loop: 691%if mmsize == 8 692 PSWAPD m0, [r1 + r3] 693 PSWAPD m1, [r0 + r2] 694 pxor m0, m2 695%else 696 mova m0, [r1 + r3] 697 mova m1, [r0 + r2] 698 shufps m0, m0, 0x1b 699 shufps m1, m1, 0x1b 700 xorps m0, m2 701%endif 702 mova [r0 + r3], m1 703 mova [r1 + r2], m0 704 sub r3, mmsize 705 add r2, mmsize 706 jl .loop 707%if cpuflag(3dnow) 708 femms 709 RET 710%else 711 REP_RET 712%endif 713%endmacro 714 715%if ARCH_X86_32 716INIT_MMX 3dnow 717IMDCT_CALC_FUNC 718INIT_MMX 3dnowext 719IMDCT_CALC_FUNC 720%endif 721 722INIT_XMM sse 723IMDCT_CALC_FUNC 724 725%if ARCH_X86_32 726INIT_MMX 3dnow 727%define mulps pfmul 
728%define addps pfadd 729%define subps pfsub 730%define unpcklps punpckldq 731%define unpckhps punpckhdq 732DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] 733DECL_PASS pass_interleave_3dnow, PASS_BIG 0 734%define pass_3dnowext pass_3dnow 735%define pass_interleave_3dnowext pass_interleave_3dnow 736%endif 737 738%ifdef PIC 739%define SECTION_REL - $$ 740%else 741%define SECTION_REL 742%endif 743 744%macro DECL_FFT 1-2 ; nbits, suffix 745%ifidn %0, 1 746%xdefine fullsuffix SUFFIX 747%else 748%xdefine fullsuffix %2 %+ SUFFIX 749%endif 750%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL 751%if %1>=5 752%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL 753%endif 754%if %1>=6 755%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL 756%endif 757 758%assign n 1<<%1 759%rep 17-%1 760%assign n2 n/2 761%assign n4 n/4 762%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL 763 764align 16 765fft %+ n %+ fullsuffix: 766 call fft %+ n2 %+ SUFFIX 767 add r0, n*4 - (n&(-2<<%1)) 768 call fft %+ n4 %+ SUFFIX 769 add r0, n*2 - (n2&(-2<<%1)) 770 call fft %+ n4 %+ SUFFIX 771 sub r0, n*6 + (n2&(-2<<%1)) 772 lea r1, [cos_ %+ n] 773 mov r2d, n4/2 774 jmp pass %+ fullsuffix 775 776%assign n n*2 777%endrep 778%undef n 779 780align 8 781dispatch_tab %+ fullsuffix: pointer list_of_fft 782%endmacro ; DECL_FFT 783 784%if HAVE_AVX_EXTERNAL 785INIT_YMM avx 786DECL_FFT 6 787DECL_FFT 6, _interleave 788%endif 789INIT_XMM sse 790DECL_FFT 5 791DECL_FFT 5, _interleave 792%if ARCH_X86_32 793INIT_MMX 3dnow 794DECL_FFT 4 795DECL_FFT 4, _interleave 796INIT_MMX 3dnowext 797DECL_FFT 4 798DECL_FFT 4, _interleave 799%endif 800 801INIT_XMM sse 802%undef mulps 803%undef addps 804%undef subps 805%undef unpcklps 806%undef unpckhps 807 808%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 809%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 810 PSWAPD m0, [%3+%2*4] 811 movq m2, [%3+%1*4-8] 812 movq m3, m0 813 punpckldq 
m0, m2 814 punpckhdq m2, m3 815 movd m1, [%4+%1*2-4] ; tcos[j] 816 movd m3, [%4+%2*2] ; tcos[n4-j-1] 817 punpckldq m1, [%5+%1*2-4] ; tsin[j] 818 punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] 819 820 mova m4, m0 821 PSWAPD m5, m1 822 pfmul m0, m1 823 pfmul m4, m5 824 mova m6, m2 825 PSWAPD m5, m3 826 pfmul m2, m3 827 pfmul m6, m5 828%if cpuflag(3dnowext) 829 pfpnacc m0, m4 830 pfpnacc m2, m6 831%else 832 SBUTTERFLY dq, 0, 4, 1 833 SBUTTERFLY dq, 2, 6, 3 834 pxor m4, m7 835 pxor m6, m7 836 pfadd m0, m4 837 pfadd m2, m6 838%endif 839%else 840 movaps xmm0, [%3+%2*4] 841 movaps xmm1, [%3+%1*4-0x10] 842 movaps xmm2, xmm0 843 shufps xmm0, xmm1, 0x88 844 shufps xmm1, xmm2, 0x77 845 movlps xmm4, [%4+%2*2] 846 movlps xmm5, [%5+%2*2+0x0] 847 movhps xmm4, [%4+%1*2-0x8] 848 movhps xmm5, [%5+%1*2-0x8] 849 movaps xmm2, xmm0 850 movaps xmm3, xmm1 851 mulps xmm0, xmm5 852 mulps xmm1, xmm4 853 mulps xmm2, xmm4 854 mulps xmm3, xmm5 855 subps xmm1, xmm0 856 addps xmm2, xmm3 857 movaps xmm0, xmm1 858 unpcklps xmm1, xmm2 859 unpckhps xmm0, xmm2 860%endif 861%endmacro 862 863%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 864 mulps m6, %3, [%5+%1] 865 mulps m7, %2, [%5+%1] 866 mulps %2, %2, [%6+%1] 867 mulps %3, %3, [%6+%1] 868 subps %2, %2, m6 869 addps %3, %3, m7 870%endmacro 871 872%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 873.post: 874 vmovaps ymm1, [%3+%1*2] 875 vmovaps ymm0, [%3+%1*2+0x20] 876 vmovaps ymm3, [%3+%2*2] 877 vmovaps ymm2, [%3+%2*2+0x20] 878 879 CMUL %1, ymm0, ymm1, %3, %4, %5 880 CMUL %2, ymm2, ymm3, %3, %4, %5 881 vshufps ymm1, ymm1, ymm1, 0x1b 882 vshufps ymm3, ymm3, ymm3, 0x1b 883 vperm2f128 ymm1, ymm1, ymm1, 0x01 884 vperm2f128 ymm3, ymm3, ymm3, 0x01 885 vunpcklps ymm6, ymm2, ymm1 886 vunpckhps ymm4, ymm2, ymm1 887 vunpcklps ymm7, ymm0, ymm3 888 vunpckhps ymm5, ymm0, ymm3 889 890 vextractf128 [%3+%1*2], ymm7, 0 891 vextractf128 [%3+%1*2+0x10], ymm5, 0 892 vextractf128 [%3+%1*2+0x20], ymm7, 1 893 vextractf128 [%3+%1*2+0x30], ymm5, 1 894 895 vextractf128 [%3+%2*2], 
ymm6, 0 896 vextractf128 [%3+%2*2+0x10], ymm4, 0 897 vextractf128 [%3+%2*2+0x20], ymm6, 1 898 vextractf128 [%3+%2*2+0x30], ymm4, 1 899 sub %2, 0x20 900 add %1, 0x20 901 jl .post 902%endmacro 903 904%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 905.post: 906 movaps xmm1, [%3+%1*2] 907 movaps xmm0, [%3+%1*2+0x10] 908 CMUL %1, xmm0, xmm1, %3, %4, %5 909 movaps xmm5, [%3+%2*2] 910 movaps xmm4, [%3+%2*2+0x10] 911 CMUL %2, xmm4, xmm5, %3, %4, %5 912 shufps xmm1, xmm1, 0x1b 913 shufps xmm5, xmm5, 0x1b 914 movaps xmm6, xmm4 915 unpckhps xmm4, xmm1 916 unpcklps xmm6, xmm1 917 movaps xmm2, xmm0 918 unpcklps xmm0, xmm5 919 unpckhps xmm2, xmm5 920 movaps [%3+%2*2], xmm6 921 movaps [%3+%2*2+0x10], xmm4 922 movaps [%3+%1*2], xmm0 923 movaps [%3+%1*2+0x10], xmm2 924 sub %2, 0x10 925 add %1, 0x10 926 jl .post 927%endmacro 928 929%macro CMUL_3DNOW 6 930 mova m6, [%1+%2*2] 931 mova %3, [%1+%2*2+8] 932 mova %4, m6 933 mova m7, %3 934 pfmul m6, [%5+%2] 935 pfmul %3, [%6+%2] 936 pfmul %4, [%6+%2] 937 pfmul m7, [%5+%2] 938 pfsub %3, m6 939 pfadd %4, m7 940%endmacro 941 942%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 943.post: 944 CMUL_3DNOW %3, %1, m0, m1, %4, %5 945 CMUL_3DNOW %3, %2, m2, m3, %4, %5 946 movd [%3+%1*2+ 0], m0 947 movd [%3+%2*2+12], m1 948 movd [%3+%2*2+ 0], m2 949 movd [%3+%1*2+12], m3 950 psrlq m0, 32 951 psrlq m1, 32 952 psrlq m2, 32 953 psrlq m3, 32 954 movd [%3+%1*2+ 8], m0 955 movd [%3+%2*2+ 4], m1 956 movd [%3+%2*2+ 8], m2 957 movd [%3+%1*2+ 4], m3 958 sub %2, 8 959 add %1, 8 960 jl .post 961%endmacro 962 963%macro DECL_IMDCT 1 964cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input 965%if ARCH_X86_64 966%define rrevtab r7 967%define rtcos r8 968%define rtsin r9 969%else 970%define rrevtab r6 971%define rtsin r6 972%define rtcos r5 973%endif 974 mov r3d, [r0+FFTContext.mdctsize] 975 add r2, r3 976 shr r3, 1 977 mov rtcos, [r0+FFTContext.tcos] 978 mov rtsin, [r0+FFTContext.tsin] 979 add rtcos, r3 980 add rtsin, 
r3 981%if ARCH_X86_64 == 0 982 push rtcos 983 push rtsin 984%endif 985 shr r3, 1 986 mov rrevtab, [r0+FFTContext.revtab] 987 add rrevtab, r3 988%if ARCH_X86_64 == 0 989 push rrevtab 990%endif 991 992%if mmsize == 8 993 sub r3, 2 994%else 995 sub r3, 4 996%endif 997%if ARCH_X86_64 || mmsize == 8 998 xor r4, r4 999 sub r4, r3 1000%endif 1001%if notcpuflag(3dnowext) && mmsize == 8 1002 movd m7, [ps_neg] 1003%endif 1004.pre: 1005%if ARCH_X86_64 == 0 1006;unspill 1007%if mmsize != 8 1008 xor r4, r4 1009 sub r4, r3 1010%endif 1011 mov rtcos, [esp+8] 1012 mov rtsin, [esp+4] 1013%endif 1014 1015 PREROTATER r4, r3, r2, rtcos, rtsin 1016%if mmsize == 8 1017 mov r6, [esp] ; rrevtab = ptr+n8 1018 movzx r5, word [rrevtab+r4-2] ; rrevtab[j] 1019 movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] 1020 mova [r1+r5*8], m0 1021 mova [r1+r6*8], m2 1022 add r4, 2 1023 sub r3, 2 1024%else 1025%if ARCH_X86_64 1026 movzx r5, word [rrevtab+r4-4] 1027 movzx r6, word [rrevtab+r4-2] 1028 movzx r10, word [rrevtab+r3] 1029 movzx r11, word [rrevtab+r3+2] 1030 movlps [r1+r5 *8], xmm0 1031 movhps [r1+r6 *8], xmm0 1032 movlps [r1+r10*8], xmm1 1033 movhps [r1+r11*8], xmm1 1034 add r4, 4 1035%else 1036 mov r6, [esp] 1037 movzx r5, word [r6+r4-4] 1038 movzx r4, word [r6+r4-2] 1039 movlps [r1+r5*8], xmm0 1040 movhps [r1+r4*8], xmm0 1041 movzx r5, word [r6+r3] 1042 movzx r4, word [r6+r3+2] 1043 movlps [r1+r5*8], xmm1 1044 movhps [r1+r4*8], xmm1 1045%endif 1046 sub r3, 4 1047%endif 1048 jns .pre 1049 1050 mov r5, r0 1051 mov r6, r1 1052 mov r0, r1 1053 mov r1d, [r5+FFTContext.nbits] 1054 1055 FFT_DISPATCH SUFFIX, r1 1056 1057 mov r0d, [r5+FFTContext.mdctsize] 1058 add r6, r0 1059 shr r0, 1 1060%if ARCH_X86_64 == 0 1061%define rtcos r2 1062%define rtsin r3 1063 mov rtcos, [esp+8] 1064 mov rtsin, [esp+4] 1065%endif 1066 neg r0 1067 mov r1, -mmsize 1068 sub r1, r0 1069 %1 r0, r1, r6, rtcos, rtsin 1070%if ARCH_X86_64 == 0 1071 add esp, 12 1072%endif 1073%if mmsize == 8 1074 femms 1075%endif 1076 RET 
1077%endmacro 1078 1079DECL_IMDCT POSROTATESHUF 1080 1081%if ARCH_X86_32 1082INIT_MMX 3dnow 1083DECL_IMDCT POSROTATESHUF_3DNOW 1084 1085INIT_MMX 3dnowext 1086DECL_IMDCT POSROTATESHUF_3DNOW 1087%endif 1088 1089INIT_YMM avx 1090 1091%if HAVE_AVX_EXTERNAL 1092DECL_IMDCT POSROTATESHUF_AVX 1093%endif 1094