;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
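;
; A rough C picture of the two layouts (a documentation sketch only;
; FFTComplex is the type from libavcodec/fft.h):
;   typedef struct FFTComplex { float re, im; } FFTComplex;
;   // C layout:   z[0].re, z[0].im, z[1].re, z[1].im, ...
;   // SSE layout: re0 re1 re2 re3 | im0 im1 im2 im3 | re4 re5 re6 re7 ...
;   // (3DNow uses the same scheme with blocks of 2 floats instead of 4)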

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc
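
; Note: this struc mirrors the leading fields of the C FFTContext;
; the field order and sizes must be kept in sync with libavcodec/fft.h
; by hand, since the assembler cannot check them against the C struct.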

SECTION_RODATA 32

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION .text

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro
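
; In C terms, T2_3DNOW is one radix-2 butterfly on packed {re,im}
; pairs (a sketch for documentation, not part of the original source):
;   z0.re = mem0.re + mem1.re;  z0.im = mem0.im + mem1.im;
;   z1.re = mem0.re - mem1.re;  z1.im = mem0.im - mem1.im;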

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
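
; In scalar C the fft4 computes, using the t-names from the comments
; above (sketch; inputs arrive in the pre-permuted order shown):
;   t1 = r0+r1;  t2 = i0+i1;  t3 = r0-r1;  t4 = i0-i1;
;   t6 = r2+r3;  t5 = i2+i3;  t7 = i2-i3;  t8 = r3-r2;
;   {r0,i0} = {t1+t6, t2+t5};   {r2,i2} = {t1-t6, t2-t5};
;   {r1,i1} = {t3+t7, t4+t8};   {r3,i3} = {t3-t7, t4-t8};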

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8s
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro
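
; Summary: the odd half (z5, z7) is rotated by the odd eighth roots of
; unity, +-sqrt(1/2) +- i*sqrt(1/2) (applied via ps_root2mppm and
; ps_root2), then two more radix-2 stages combine it with the fft4
; result of the even half already held in %1/%2.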

%macro INTERL 5
%if cpuflag(avx)
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse) || cpuflag(3dnow)
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endif
%endmacro

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro
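
; Per element, PASS_SMALL (and PASS_BIG below) compute one split-radix
; combining butterfly. In scalar C, mirroring the t-names above
; (sketch; w = {wre,wim} is the twiddle, z0..z3 the elements combined):
;   a  = r2*wre + i2*wim;   b  = i2*wre - r2*wim;
;   c  = r3*wre - i3*wim;   d  = i3*wre + r3*wim;
;   t5 = c + a;   t3 = c - a;   t6 = b + d;   t4 = b - d;
;   r0' = r0 + t5;  r2' = r0 - t5;   i0' = i0 + t6;  i2' = i0 - t6;
;   r1' = r1 + t4;  r3' = r1 - t4;   i1' = i1 + t3;  i3' = i1 - t3;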

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro
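
; Each generated pass behaves like a local C routine of the form
; (an assumption for documentation; these symbols are not exported):
;   void pass(FFTComplex *z, const FFTSample *wtab, int n);
; advancing two vectors of z and one vector of twiddles per iteration,
; with n decremented by mmsize/8 each time around the loop.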

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
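
; Roughly equivalent C for the dispatch (sketch): one table entry per
; transform size, with the 4-point FFT at index 0, hence the -2:
;   void (*dispatch_tab[])(void) = { fft4, fft8, fft16, /* ... */ };
;   dispatch_tab[nbits - 2]();   // run the (1 << nbits)-point FFT
; With PIC the entries are stored section-relative (see SECTION_REL
; below), so the section base $$ is added back before the call.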

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET
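
; Scalar equivalent of fft_permute (sketch): scatter through revtab
; into tmpbuf, then copy everything back in order.
;   for (i = 0; i < (1 << nbits); i += 2) {   // two entries per movaps
;       tmpbuf[revtab[i]]     = z[i];
;       tmpbuf[revtab[i + 1]] = z[i + 1];
;   }
;   memcpy(z, tmpbuf, (1 << nbits) * sizeof(FFTComplex));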

%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
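
; The generated fftN bodies implement the split-radix recursion: one
; N/2-point transform plus two N/4-point transforms, followed by a
; combining pass over the cos_N twiddle table. In C pseudocode
; (sketch; the r0 adjustments between the calls account for the
; blocked data layout, so offsets A and B are only illustrative):
;   void fftN(FFTComplex *z) {
;       fft_N2(z);               // first half
;       fft_N4(z + A);           // third quarter
;       fft_N4(z + B);           // fourth quarter
;       pass(z, cos_N, N / 8);   // N/8 butterfly groups per pass
;   }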

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro
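
; Scalar equivalent of the pre-rotation (sketch, following the generic
; C imdct_half, where in1 walks forward and in2 backward through the
; input):
;   FFTSample re = in2[-2*k], im = in1[2*k];
;   z[revtab[k]].re = re * tcos[k] - im * tsin[k];
;   z[revtab[k]].im = re * tsin[k] + im * tcos[k];
; (the revtab scatter itself happens in the .pre loop of imdct_half
; below, not in this macro)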

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
%if cpuflag(sse)
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%elif cpuflag(3dnow)
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endif
%endmacro
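
; Per element the SSE/AVX path computes one complex rotation (sketch;
; at the POSROTATESHUF call sites %5 points into tcos and %6 into
; tsin):
;   new_%2 = %2 * tsin[j] - %3 * tcos[j];
;   new_%3 = %3 * tsin[j] + %2 * tcos[j];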

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%elif cpuflag(sse)
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%elif cpuflag(3dnow)
    CMUL  %3, %1, m0, m1, %4, %5
    CMUL  %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq      m0, 32
    psrlq      m1, 32
    psrlq      m2, 32
    psrlq      m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub        %2, 8
    add        %1, 8
    jl         .post
%endif
%endmacro

%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT

INIT_MMX 3dnowext
DECL_IMDCT
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT
%endif
