;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively for 3DNow)
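;
; For example (a rough sketch of the layout, not a structure used by the
; code), with SSE a block of 8 complex values is kept as
;   { r0,r1,r2,r3, i0,i1,i2,i3, r4,r5,r6,r7, i4,i5,i6,i7 }
; whereas the C code keeps an interleaved FFTComplex array
;   { r0,i0, r1,i1, r2,i2, ... }.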

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
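;
; Both PASS_SMALL and PASS_BIG perform the split-radix combine step.  In
; complex terms (a rough sketch), with the twiddle w = wre + i*wim and, per
; element, a = z2*conj(w), b = z3*w, they compute:
;   z0' = z0 + (a+b)      z2' = z0 - (a+b)
;   z1' = z1 - i*(a-b)    z3' = z1 + i*(a-b)
; where (z0,z1,z2,z3) are the real/imag blocks at Z(0..1), Z(2..3), Z(4..5)
; and Z2(6..7).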
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
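
; The dispatch tables declared by DECL_FFT below hold one entry per transform
; size, starting at nbits == 2 (fft4), hence the (%2q-2) index above.  When
; PIC is defined the entries are stored relative to the section start (see
; SECTION_REL), so the $$ base is added back before the call.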

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
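
; Roughly, each fft_N body generated above performs one split-radix step:
; an N/2-point transform on the first half of the buffer, two N/4-point
; transforms on the second half, then a tail-jump into the combining pass
; with the cos_N twiddle table.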

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

%macro CMUL_3DNOW 6
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq      m0, 32
    psrlq      m1, 32
    psrlq      m2, 32
    psrlq      m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub        %2, 8
    add        %1, 8
    jl         .post
%endmacro

%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif